Resources Used:
- NLTK preprocessing: https://colab.research.google.com/github/gal-a/blog/blob/master/docs/notebooks/nlp/nltk_preprocess.ipynb#scrollTo=0JzUMH4jdXm7
- Towards Data Science, Getting Started with Text Analysis in Python: https://towardsdatascience.com/getting-started-with-text-analysis-in-python-ca13590eb4f7
- GeeksforGeeks, Text Analysis in Python 3: https://www.geeksforgeeks.org/text-analysis-in-python-3/
- Towards AI, Text Mining in Python: https://towardsai.net/p/data-mining/text-mining-in-python-steps-and-examples-78b3f8fd913b
- GitHub, Python for Text Analysis course: https://github.com/cltl/python-for-text-analysis
- A Beginner's Guide to Sentiment Analysis in Python: https://towardsdatascience.com/a-beginners-guide-to-sentiment-analysis-in-python-95e354ea84f6
# setup
#!pip install -q wordcloud
import wordcloud
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
import io
import unicodedata
import numpy as np
import re
import string
from collections import Counter
# check if gpu available for processing
from tensorflow.python.client import device_lib
def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]
print(get_available_devices())
import tensorflow as tf
tf.config.list_physical_devices('GPU')
tf.test.is_built_with_cuda()
['/device:CPU:0']
True
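# only the CPU is visible above, so everything below runs on CPU. a minimal
# sketch (standard tf.config calls; the gpus name is illustrative, not from the
# original notebook) for branching on GPU availability before heavier work:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(len(gpus), "GPU(s) available:", [g.name for g in gpus])
else:
    print("no GPU detected; continuing on CPU")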
# constants and strings
# POS (Parts Of Speech) for: nouns, adjectives, verbs and adverbs
DI_POS_TYPES = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}
POS_TYPES = list(DI_POS_TYPES.keys())
# constraints on tokens
MIN_STR_LEN = 2
RE_VALID = '[a-zA-Z0-9]'
# sampling constants to limit memory usage: keep every SAMPLE_STEP-th review
ITER = 0
SAMPLE_STEP = 5
# read data from source
review_col_list = ["UserID", "ProductID", "Date", "Review"]
dfReviews = pd.read_csv("../YelpData/YelpNYC/ReviewMap.csv", usecols=review_col_list)
ratings_col_list = ["UserID", "ProductID", "StarRating"]
dfRatings = pd.read_csv("../YelpData/YelpNYC/starRatingMap.csv", usecols=ratings_col_list)
ratings_col_list = ["UserID", "ProductID", "FakeReview"]
dfMeta = pd.read_csv("../YelpData/YelpNYC/metaData.csv", usecols=ratings_col_list)
dfReviews = pd.merge(dfReviews, dfRatings, how = 'inner', left_on = ["UserID", "ProductID"], right_on = ["UserID", "ProductID"])
print(dfReviews.describe())
print(dfReviews.head(10))
df = pd.merge(dfMeta, dfReviews, how = 'inner', left_on = ["UserID", "ProductID"], right_on = ["UserID", "ProductID"])
print(df.describe())
print(df.head(10))
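# optional sanity check (a sketch, not in the original notebook): inner joins can
# drop or duplicate rows, so compare the row counts of the inputs and the result
print("reviews:", len(dfReviews), "ratings:", len(dfRatings), "meta:", len(dfMeta), "merged:", len(df))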
stopwords = nltk.corpus.stopwords.words('english')  # note: rebinds the imported stopwords module name to the English word list
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
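# quick illustration on an assumed sample token (not from the dataset): the
# stemmer truncates to a root ('tasted' -> 'tast'), while the lemmatizer returns
# a dictionary form given a POS hint ('tasted' -> 'taste' when tagged as a verb)
print(stemmer.stem('tasted'), lemmatizer.lemmatize('tasted', pos='v'))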
# strip accents via NFKD normalization, keeping only ASCII letters and spaces
# (digits and punctuation are dropped here as well)
def remove_accents(data):
    return ''.join(x for x in unicodedata.normalize('NFKD', data) if x in string.ascii_letters or x == " ")
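# illustrative call on an assumed input string (not from the dataset): accented
# characters decompose to ASCII letters; digits and punctuation are dropped too
print(remove_accents('crème brûlée 4!'))  # -> 'creme brulee '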
              UserID      ProductID     FakeReview
count  359052.000000  359052.000000  359052.000000
mean    53992.205533     459.929601       0.794542
std     45806.707721     259.923732       0.607210
min       923.000000       0.000000      -1.000000
25%     13840.000000     247.000000       1.000000
50%     40523.000000     468.000000       1.000000
75%     87314.000000     672.000000       1.000000
max    161147.000000     922.000000       1.000000

   UserID  ProductID        Date                                             Review  FakeReview
0     923          0   12/8/2014  The food at snack is a selection of popular Gr...          -1
1     924          0   5/16/2013  This little place in Soho is wonderful. I had ...          -1
2     925          0    7/1/2013  ordered lunch for 15 from Snack last Friday. ...           -1
3     926          0   7/28/2011  This is a beautiful quaint little restaurant o...          -1
4     927          0   11/1/2010  Snack is great place for a casual sit down lu...           -1
5     928          0    9/2/2009  A solid 4 stars for this greek food spot. If ...           -1
6     929          0   8/25/2009  Let me start with a shout-out to everyone who ...          -1
7     930          0   5/20/2007  Love this place! Try the Chicken sandwich or ...           -1
8     931          0  12/27/2005  My friend and I were intrigued by the nightly ...          -1
9     932          0    5/9/2014  Stopped in for lunch today and couldn't believ...          -1

              UserID      ProductID     FakeReview     StarRating
count  359052.000000  359052.000000  359052.000000  359052.000000
mean    53992.205533     459.929601       0.794542       4.025871
std     45806.707721     259.923732       0.607210       1.055061
min       923.000000       0.000000      -1.000000       1.000000
25%     13840.000000     247.000000       1.000000       4.000000
50%     40523.000000     468.000000       1.000000       4.000000
75%     87314.000000     672.000000       1.000000       5.000000
max    161147.000000     922.000000       1.000000       5.000000

   UserID  ProductID  FakeReview        Date                                             Review  StarRating
0   30262        468           1  10/20/2004  Excellent Soup Dumplings. It's a must if you g...           4
1  107234        510           1   11/2/2004  One of the best hidden no-name neighborhood pl...           4
2   19015        142           1   12/9/2004  Really lovely Italian food, very simple and we...           5
3  116117        708           1    3/2/2005  Mario Batali at his best, this is my current f...           5
4   59929        454           1    3/7/2005  Best place for brunch if you can handle the wa...           5
5   12087        482           1   3/11/2005  This cozy, causal restaurant is localed in the...           3
6   88647        444           1   3/13/2005  Take a bottle of wine, order the mussels, soak...           5
7   25179         80           1   3/19/2005  moto is circa 1938, dusky mirrors and heavy cu...           5
8    4912        120           1   3/24/2005  after all the hype i gotta say that some of it...           3
9   25178        363           1   3/31/2005  If you want to feel like you're in the middle ...           5
# build sentiment into table
df['Sentiment'] = df['StarRating'].map({1 : -1, 2 : -1, 3 : 0, 4 : +1, 5 : +1})
print(df.head(10))
# split the reviews by sentiment into 3 separate frames
dfPositive = df[df['Sentiment'] == 1]
dfNeutral = df[df['Sentiment'] == 0]
dfNegative = df[df['Sentiment'] == -1]
print(dfPositive.describe())
print(dfNeutral.describe())
print(dfNegative.describe())
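# optional sanity check (a sketch): the three frames should partition df,
# so their lengths should sum to len(df)
print(df['Sentiment'].value_counts())
print(len(dfPositive) + len(dfNeutral) + len(dfNegative) == len(df))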
   UserID  ProductID  FakeReview        Date                                             Review  StarRating  Sentiment
0   30262        468           1  10/20/2004  Excellent Soup Dumplings. It's a must if you g...           4          1
1  107234        510           1   11/2/2004  One of the best hidden no-name neighborhood pl...           4          1
2   19015        142           1   12/9/2004  Really lovely Italian food, very simple and we...           5          1
3  116117        708           1    3/2/2005  Mario Batali at his best, this is my current f...           5          1
4   59929        454           1    3/7/2005  Best place for brunch if you can handle the wa...           5          1
5   12087        482           1   3/11/2005  This cozy, causal restaurant is localed in the...           3          0
6   88647        444           1   3/13/2005  Take a bottle of wine, order the mussels, soak...           5          1
7   25179         80           1   3/19/2005  moto is circa 1938, dusky mirrors and heavy cu...           5          1
8    4912        120           1   3/24/2005  after all the hype i gotta say that some of it...           3          0
9   25178        363           1   3/31/2005  If you want to feel like you're in the middle ...           5          1

              UserID      ProductID     FakeReview     StarRating  Sentiment
count  276407.000000  276407.000000  276407.000000  276407.000000   276407.0
mean    54546.945714     458.374734       0.799180       4.510685        1.0
std     45782.341084     260.028822       0.601093       0.499887        0.0
min       923.000000       0.000000      -1.000000       4.000000        1.0
25%     14302.500000     247.000000       1.000000       4.000000        1.0
50%     41479.000000     468.000000       1.000000       5.000000        1.0
75%     87907.500000     671.000000       1.000000       5.000000        1.0
max    161147.000000     922.000000       1.000000       5.000000        1.0

             UserID     ProductID    FakeReview  StarRating  Sentiment
count  47646.000000  47646.000000  47646.000000     47646.0    47646.0
mean   43767.620661    461.180876      0.866809         3.0        0.0
std    42529.124524    259.768302      0.498645         0.0        0.0
min      923.000000      0.000000     -1.000000         3.0        0.0
25%     9524.000000    247.000000      1.000000         3.0        0.0
50%    27353.000000    465.000000      1.000000         3.0        0.0
75%    68520.250000    672.000000      1.000000         3.0        0.0
max   161134.000000    922.000000      1.000000         3.0        0.0

             UserID     ProductID    FakeReview    StarRating  Sentiment
count  34999.000000  34999.000000  34999.000000  34999.000000    34999.0
mean   63530.378097    470.505843      0.659533      1.593588       -1.0
std    47690.882281    259.055087      0.751686      0.491170        0.0
min      923.000000      0.000000     -1.000000      1.000000       -1.0
25%    20004.500000    251.000000      1.000000      1.000000       -1.0
50%    54501.000000    468.000000      1.000000      2.000000       -1.0
75%   100593.000000    688.000000      1.000000      2.000000       -1.0
max   161122.000000    922.000000      1.000000      2.000000       -1.0
# build fake/real into separate frames
dfPosFake = dfPositive[dfPositive['FakeReview'] == -1]
dfNegFake = dfNegative[dfNegative['FakeReview'] == -1]
dfPosReal = dfPositive[dfPositive['FakeReview'] == 1]
dfNegReal = dfNegative[dfNegative['FakeReview'] == 1]
print(dfPosReal.describe())
print(dfNegReal.describe())
print(dfPosFake.describe())
print(dfNegFake.describe())
              UserID      ProductID  FakeReview     StarRating  Sentiment
count  248653.000000  248653.000000    248653.0  248653.000000   248653.0
mean    51907.433777     457.944244         1.0       4.499294        1.0
std     44874.485078     260.363084         0.0       0.500001        0.0
min       937.000000       0.000000         1.0       4.000000        1.0
25%     13409.000000     247.000000         1.0       4.000000        1.0
50%     38068.000000     468.000000         1.0       4.000000        1.0
75%     82866.000000     672.000000         1.0       5.000000        1.0
max    161147.000000     922.000000         1.0       5.000000        1.0

             UserID     ProductID  FakeReview    StarRating  Sentiment
count  29041.000000  29041.000000     29041.0  29041.000000    29041.0
mean   58941.541820    470.547226         1.0      1.633002       -1.0
std    46871.256419    259.069635         0.0      0.481994        0.0
min      940.000000      0.000000         1.0      1.000000       -1.0
25%    16825.000000    251.000000         1.0      1.000000       -1.0
50%    47844.000000    468.000000         1.0      2.000000       -1.0
75%    93886.000000    688.000000         1.0      2.000000       -1.0
max   161122.000000    922.000000         1.0      2.000000       -1.0

             UserID     ProductID  FakeReview    StarRating  Sentiment
count  27754.000000  27754.000000     27754.0  27754.000000    27754.0
mean   78194.800497    462.231570        -1.0      4.612741        1.0
std    47030.095529    256.987184         0.0      0.487133        0.0
min      923.000000      0.000000        -1.0      4.000000        1.0
25%    37674.250000    247.000000        -1.0      4.000000        1.0
50%    78376.500000    468.000000        -1.0      5.000000        1.0
75%   118765.750000    666.000000        -1.0      5.000000        1.0
max   161047.000000    922.000000        -1.0      5.000000        1.0

            UserID    ProductID  FakeReview   StarRating  Sentiment
count  5958.000000  5958.000000      5958.0  5958.000000     5958.0
mean  85897.681605   470.304129        -1.0     1.401477       -1.0
std   45272.502481   259.005809         0.0     0.490238        0.0
min     923.000000     1.000000        -1.0     1.000000       -1.0
25%   48264.250000   250.250000        -1.0     1.000000       -1.0
50%   88427.500000   466.000000        -1.0     1.000000       -1.0
75%  126741.750000   688.000000        -1.0     2.000000       -1.0
max  161111.000000   922.000000        -1.0     2.000000       -1.0
# process reviews by removing stopwords
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in df['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder in the lemma list for non-matching tokens (overwritten when a match is found)
        lemmaTokensFromReview.append("-")
        # drop stopwords, then stem and lemmatize the remaining tokens
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # tag the token's part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the "-" placeholder with the lemma
                        lemmaTokensFromReview[-1] = lem
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    reviewsLemmaStrings.append(' '.join(lemmaTokensFromReview))
# build result df (lemma tokens from the last sampled review)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
dfTokens.head(10):
       0
0  great
1   food
2      -
3  great
4  drink
5      -
6      -
7      -
8   even
9   pair

dfLemma.head(10):
   lem
0  - food - snack - - selection - popular greek dish - - appetizer tray - good - - - greek salad - - - underwhelmed - - main course - - - - table - - - - sometimes hard - get seat -
1  - solid - star - - greek food spot - - - - - fan - lamb - - - - - - come - - try - lamb sandwich - amazingly tender - juicy - onion - arugula - also - - good greek salad -
2  pretty cool place - good food - good people
3  - - - braise lamb sandwich - - - - - best sandwich - - life - - - - favour - try - place - friendly service - cosy atmosphere -
4  - good big greek cooking - - come - city - - gorgeous sunday - - brutal winter - - - first clear sunny crisp sunday - walk - soho - - - fav - - - - - - - hungry - decide - try - hole - - wall gem - literally - hole - - wall - - think - perfect - believe - - - table - - - - small - restroom - - - hall - - food - delicious - - - hummus - warm pita - lamb stew - fresh - - perfect - pastitsio - sp - - - perfect - portion - - enough - - dim - light lit candle - - - - perfect way - end - sunday - full tummy - wine - - real gem - - service - good hard - - - - - - small place - - feel - - - someone - home - - guest - - cooking - - home good -
5  - food - amaze - - service - equally amaze - - friend - - - definitely come back - - place -
6  - - - - - notice - - - - - review - - - - - healthiest eater - - - - try - - snack - - - best greek salad - - ever taste - big juicy tomato - crunchy fresh cucumber - fantastic olive oil dress - - - nt eat greek salad typcially - - - - eat - - snack - actually - - crave - - visting new york - - - try - -
7  - taramosalata - - die - - - recommend - shrimp santorini - also - - good friend - - greek love - restaurant - say - taste - authentic -
8  - tiny cafe - thompson - - - favorite - mine - year - - - - tell - - - everything - fresh - - attention - detail make - - keeper - - lamb sammie - ciabatta - melt - - mouth chunk - lamb - - roast onion - pretty much - die - - - - big enough - - gal - share - - sure - - favorite soup - - time - - rock - - - avgolemono - super lemony - perfect - - al dente orzo - serve - toast sliver - olive oil coat fresh bread - - - - take away bag - toss little twist - waxed paper fill - - - jordan almond - pretty adorable - - - - - sucker - - little touch -
9  really delicious sandwich - - lamb - - - enormous - - - able - eat - - - meal - tight - - - - - recommend grab - go - definitely - neat block - visit - lunch - - ever get bore - sullivan st - - accept credit card -
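# the tokenize/POS-tag/stem/lemmatize pipeline above is repeated verbatim for each
# review subset below; a reusable helper under the same logic is sketched here
# (process_reviews is an illustrative name, not part of the original notebook)
def process_reviews(reviews):
    global ITER
    tokensOut, tokenLists, lemmaStrings = [], [], []
    for review in reviews:
        ITER += 1
        if ITER % SAMPLE_STEP != 1:
            continue
        tokens = [w.lower() for s in nltk.sent_tokenize(review) for w in nltk.word_tokenize(s)]
        tokensFromReview, lemmaTokens = [], []
        for token in tokens:
            newToken = remove_accents(token).translate(str.maketrans('', '', string.punctuation))
            tokensFromReview.append(newToken)
            lemmaTokens.append("-")
            if newToken not in stopwords and re.search(RE_VALID, newToken) and len(newToken) >= MIN_STR_LEN:
                pos = nltk.pos_tag([newToken])[0][1][:2]
                lem = lemmatizer.lemmatize(newToken, pos=DI_POS_TYPES.get(pos, 'n'))
                if pos in POS_TYPES:
                    tokensOut.append((newToken, stemmer.stem(newToken), lem, pos))
                    lemmaTokens[-1] = lem
        tokenLists.append(tokensFromReview)
        lemmaStrings.append(' '.join(lemmaTokens))
    return tokensOut, tokenLists, lemmaStrings
# e.g. reviewsTokens, reviewsTokenLists, reviewsLemmaStrings = process_reviews(dfPositive['Review'])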
# sum of counts
print("Group by lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by lemma'd words, add count and sort:
Get just the first row in each lemma'd group
dfWords.head(10):
     lem  index  token   stem pos  counts
0     nt    163     nt     nt  NN   54697
1  place     41  place  place  NN   50843
2   food      0   food   food  NN   49892
3   good      8   good   good  JJ   48059
4    get     17    get    get  VB   42860
5     go    266  going     go  VB   39381
6  great    389  great  great  JJ   34853
7   come     26   come   come  VB   31299
8  order    458  order  order  NN   30485
9   time    222   time   time  NN   25805

Top 10 words by part of speech:

POS_TYPE: NN
           lem  index       token     stem pos  counts
0           nt    163          nt       nt  NN   54697
1        place     41       place    place  NN   50843
2         food      0        food     food  NN   49892
8        order    458       order    order  NN   30485
9         time    222        time     time  NN   25805
13        wait    438        wait     wait  NN   22995
14         try     27         try      tri  NN   22476
15     service     56     service   servic  NN   21731
17  restaurant    186  restaurant  restaur  NN   19958
19        love    185       loves     love  NN   18815

POS_TYPE: JJ
          lem  index      token    stem pos  counts
3        good      8       good    good  JJ   48059
6       great    389      great   great  JJ   34853
18  delicious     94  delicious  delici  JJ   19434
22       best     49       best    best  JJ   16375
33       nice    338       nice    nice  JJ   14203
34     little    243     little   littl  JJ   14056
45       much    213       much    much  JJ   11175
46      small     90      small   small  JJ   11139
62      fresh    100      fresh   fresh  JJ    9068
80        new    173        new     new  JJ    7892

POS_TYPE: VB
      lem  index    token  stem pos  counts
4     get     17      get   get  VB   42860
5      go    266    going    go  VB   39381
7    come     26     come  come  VB   31299
12   make    201    makes  make  VB   23026
27    say    187     says   say  VB   15619
31  taste    152   tasted  tast  VB   14506
32    fry    876    fried   fri  VB   14325
35   take    239     take  take  VB   13754
39  amaze    133  amazing  amaz  VB   12150
41   give   1093     gave  gave  VB   11647

POS_TYPE: RB
           lem  index       token      stem pos  counts
10      really    255      really    realli  RB   24657
11        well    307      better    better  RB   23251
16        back    140        back      back  RB   21142
20        also     35        also      also  RB   18654
28        even    511        even      even  RB   14871
36  definitely    138  definitely   definit  RB   13372
42      pretty     39      pretty    pretti  RB   11560
59       first     69       first     first  RB    9568
65      always   1977      always     alway  RB    8927
77    friendly     55    friendly  friendli  RB    8072
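# toy illustration of the counting pattern above (assumed data, not from the
# dataset): transform('count') broadcasts each group's size onto every row, and
# groupby().first() then keeps one representative row per lemma
toy = pd.DataFrame({'lem': ['go', 'go', 'eat'], 'token': ['went', 'going', 'ate']})
toy['counts'] = toy.groupby('lem')['lem'].transform('count')
print(toy.groupby('lem').first())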
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
flatTokensList[:10]: ['the', 'food', 'at', 'snack', 'is', 'a', 'selection', 'of', 'popular', 'greek']

Frequency Distribution of all words[:30]:  [('the', 462518), ('and', 284830), ('i', 238418), ('a', 224568), ('to', 172121), ('was', 159774), ('it', 139586), ('of', 134772), ('is', 111390), ('for', 99904), ('in', 91479), ('with', 81477), ('but', 79300), ('that', 73170), ('we', 69666), ('you', 69586), ('this', 64822), ('my', 63984), ('on', 57278), ('s', 54730), ('nt', 54697), ('had', 51391), ('not', 49412), ('were', 49195), ('food', 49084), ('they', 48617), ('good', 47878), ('so', 46517), ('place', 45590), ('have', 42379)]
Frequency Distribution of lemma[:30]: [('nt', 54697), ('place', 50843), ('food', 49892), ('good', 48059), ('get', 42860), ('go', 39381), ('great', 34853), ('come', 31299), ('order', 30485), ('time', 25805), ('really', 24657), ('well', 23251), ('make', 23026), ('wait', 22995), ('try', 22476), ('service', 21731), ('back', 21142), ('restaurant', 19958), ('delicious', 19434), ('love', 18815), ('also', 18654), ('dish', 16481), ('best', 16375), ('table', 16216), ('eat', 15703), ('sauce', 15684), ('friend', 15674), ('say', 15619), ('even', 14871), ('menu', 14814)]
[matplotlib output: frequency plots of the 30 most common items, x-axis 'Samples', y-axis 'Counts']
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistAllWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistLemma.csv", encoding = 'utf-8', index = False, header = False)
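# wordcloud is imported in setup but otherwise unused; a minimal sketch (sizes
# and colors are illustrative assumptions) rendering the lemma frequencies above:
wc = wordcloud.WordCloud(width=800, height=400, background_color='white')
wc.generate_from_frequencies(dict(freqDist2))
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()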
# process reviews by removing stopwords in positive reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfPositive['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder in the lemma list for non-matching tokens (overwritten when a match is found)
        lemmaTokensFromReview.append("-")
        # drop stopwords, then stem and lemmatize the remaining tokens
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # tag the token's part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the "-" placeholder with the lemma
                        lemmaTokensFromReview[-1] = lem
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    reviewsLemmaStrings.append(' '.join(lemmaTokensFromReview))
# build result df (lemma tokens from the last sampled review)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("Positive dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("Positive dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
Positive dfTokens.head(10):
        0
0       -
1      nt
2     say
3  enough
4    good
5   thing
6       -
7       -
8   place
9       -

Positive dfLemma.head(10):
   lem
0  - solid - star - - greek food spot - - - - - fan - lamb - - - - - - come - - try - lamb sandwich - amazingly tender - juicy - onion - arugula - also - - good greek salad -
1  pretty cool place - good food - good people
2  - - - braise lamb sandwich - - - - - best sandwich - - life - - - - favour - try - place - friendly service - cosy atmosphere -
3  need - quick bite - stop - - - - review - - - really cute - small - - - - roast sandwich - - - good - service - - friendly - - - nice place - break - shopping
4  quick - delicious - fill - - - - hour - shopping - soho - - starve - - - nt accommodate - - - - - first - - - tiny - - - take - number - call - back - - min later - fresh ingredient - - flavor hit - - right note - - pastitsio - delicate - - hummus - creamy - - dolmades werent - dense - - tart - service - - - smile - - - definitely good - try - - mellow saturday afternoon -
5  novelty meet mediterranean meet soho - - place - - squeeze - - - - - seater table - - - - mean squeeze - - - - money spot - lunch - - quick - - go bite - go - - lamb sandwich - - dressing - fantastic flavor pairing - - - - look - something lighter - - - stomach - - wallet - try - soup - - mediterranean sandwich - full - veggie - - great variety - cute location - good food -
6  - place - tiny - - think - fit - people max - keep - - mind - consider - come - - - weekend night - - food - great - - set romantic - - recommend come - - - - - - area - - - - open seat - oh word - warn - - - guess - - standard - small shop establishment - nyc - - saw - roach - - wall - -
7  perfect - - - name implies - great butter bean salad - even well winter soup - perfect - date - - oneonone dinner - - - nt bring - part - - - - - - - - exclusively single table - seat maybe - - together - really - gem - especially - - - nt advertise - - - greek restaurant - - - food - definitely mediterranean -
8  yums - - - try - carp roe - - - sooooooooooooo good - - - - feta - tomato - - - combination platter - - jam -
9  small place big - taste - stop - - - wife - shopping - absolutely delicious - - friendly waitress -
# sum of counts
print("Group by positive lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each positive lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 positive words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by positive lemma'd words, add count and sort:
Get just the first row in each positive lemma'd group
dfWords.head(10):
     lem  index    token   stem pos  counts
0  place     22    place  place  NN   38165
1     nt     64       nt     nt  NN   35926
2   food      3     food   food  NN   35534
3   good     17     good   good  JJ   34960
4    get    353      get    get  VB   31360
5  great    131    great  great  JJ   29575
6     go    111       go     go  VB   28693
7   come      7     come   come  VB   22294
8  order    231  ordered  order  VB   20162
9   time    510     time   time  NN   19299

Top 10 positive words by part of speech:

POS_TYPE: NN
           lem  index       token     stem pos  counts
0        place     22       place    place  NN   38165
1           nt     64          nt       nt  NN   35926
2         food      3        food     food  NN   35534
9         time    510        time     time  NN   19299
13         try      8         try      tri  NN   17078
16        love    242        love     love  NN   16338
18     service     37     service   servic  NN   14922
20  restaurant    200  restaurant  restaur  NN   14355
22       sauce    524      sauces     sauc  NN   12063
23        dish    333      dishes     dish  NN   11573

POS_TYPE: JJ
          lem  index      token    stem pos  counts
3        good     17       good    good  JJ   34960
5       great    131      great   great  JJ   29575
12  delicious     58  delicious  delici  JJ   17378
21       best     30       best    best  JJ   13829
31       nice     53       nice    nice  JJ   10691
32     little    287     little   littl  JJ   10673
47      small     47      small   small  JJ    8115
54      fresh     74      fresh   fresh  JJ    7545
55       much   1379       much    much  JJ    7505
77        new   1295        new     new  JJ    6187

POS_TYPE: VB
      lem  index    token   stem pos  counts
4     get    353      get    get  VB   31360
6      go    111       go     go  VB   28693
7    come      7     come   come  VB   22294
8   order    231  ordered  order  VB   20162
11   make    553    makes   make  VB   17636
15   wait    362   waited   wait  VB   16345
25    eat    265   eating    eat  VB   11418
29  amaze    376  amazing   amaz  VB   11175
33    fry   3261    fried    fri  VB   10611
35    say    530      say    say  VB   10172

POS_TYPE: RB
           lem  index       token      stem pos  counts
10      really     45      really    realli  RB   17895
14        well    178      better    better  RB   16389
17        back     71        back      back  RB   15473
19        also     16        also      also  RB   14859
26  definitely     90  definitely   definit  RB   11334
34        even    177        even      even  RB   10330
50      pretty     20      pretty    pretti  RB    7684
52      always   1301      always     alway  RB    7641
61       first     66       first     first  RB    7107
65    friendly     36    friendly  friendli  RB    6870
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Positive flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Positive flatTokensList[:10]: ['a', 'solid', '', 'stars', 'for', 'this', 'greek', 'food', 'spot', '']

Positive Frequency Distribution of all words[:30]:  [('the', 339730), ('and', 215930), ('i', 169996), ('a', 166721), ('to', 121316), ('was', 107807), ('it', 100928), ('of', 100117), ('is', 86846), ('for', 71858), ('in', 68609), ('with', 62532), ('but', 54269), ('you', 54261), ('that', 50960), ('we', 47933), ('this', 47727), ('my', 46778), ('on', 41822), ('s', 40978), ('had', 37770), ('nt', 35926), ('so', 35035), ('food', 34973), ('good', 34810), ('place', 34424), ('they', 34375), ('were', 33137), ('have', 31019), ('not', 30695)]
Positive Frequency Distribution of lemma[:30]: [('place', 38165), ('nt', 35926), ('food', 35534), ('good', 34960), ('get', 31360), ('great', 29575), ('go', 28693), ('come', 22294), ('order', 20162), ('time', 19299), ('really', 17895), ('make', 17636), ('delicious', 17378), ('try', 17078), ('well', 16389), ('wait', 16345), ('love', 16338), ('back', 15473), ('service', 14922), ('also', 14859), ('restaurant', 14355), ('best', 13829), ('sauce', 12063), ('dish', 11573), ('friend', 11426), ('eat', 11418), ('definitely', 11334), ('menu', 11315), ('chicken', 11196), ('amaze', 11175)]
[matplotlib output: frequency plots of the 30 most common items, x-axis 'Samples', y-axis 'Counts']
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistPosWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistPosLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in neutral reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNeutral['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder in the lemma list for non-matching tokens (overwritten when a match is found)
        lemmaTokensFromReview.append("-")
        # drop stopwords, then stem and lemmatize the remaining tokens
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # tag the token's part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the "-" placeholder with the lemma
                        lemmaTokensFromReview[-1] = lem
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    reviewsLemmaStrings.append(' '.join(lemmaTokensFromReview))
# build result df (lemma tokens from the last sampled review)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("Neutral dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("Neutral dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
Neutral dfTokens.head(10):
         0
0        -
1    pizza
2        -
3        -
4     good
5        -
6        -
7    staff
8        -
9  helpful

Neutral dfLemma.head(10):
   lem
0  - little place - soho - wonderful - - - - lamb sandwich - - glass - wine - - price shock - - - small - serve - - - - - - - - soho - - staff - - - little snotty - rude - - - food - great - - - nt expect worldclass service -
1  nice little greek restaurant - serf authentic greek dish - - nt go - - - look - - gyro - - - type - greek food - - ever - - - - - - small - quaint restaurant - seat - total - - people - - - food - - good - - - - veal - rice dish - - - portion - - - little small -
2  decent mediterranean place - - space - - small - - - dish - fresh - - service - friendly - - little overprice - - opinion - - - - nt understand - - - - extra charge - pita - especially - soup - hummus - beet salad - - fresh - - - large - - - - beet salad - soup - also tasty - hearty - - particular - lentil soup - - - tad - - salty side -
3  - really enjoy - experience - - tiny yet tasty restaurant - - - honest - - first big - - - - - seat almost immediately - - seat restaurant - - try go past pm - - - friend - jennifer - - - - get - fava - start - - - - - tasty - - thicker texture - hummus - - - - get extra pita - free - - - also get - classic greek avgolemon - - - - first time - - traditional dish - - - soup afterwards - - forever compare - - - - jennifer - say - - - miss - chicken - - even - - - - - - flavorful dish - jennifer get - special - - day lamb dish - - - uniquely flavor - - - - opinion - - bit expensive - - quantity - - - appreciate - - - - - - table - - - - rush - - - restaurant - - - - - people start wait - seat - - tiny space - - - automatically feel guilty - stay - long - - - - great food - - - big expensiveo - - - opinion - - - glad - get - chance - try - - -
4  pro - - food - actually - good con - someone - - nt drink wine - - alcohol - - matter - - picked - wine - - list - - - nt see - - - - - - way - - order - bottle - white - - swear - taste - someone pour - half - bottle - refill - - milwaukee - best - natty lite - also - - service - - - good - pleasant - - - - strange consider - - nt possibly fit - - - people - - entire restaurant -
5  peppinos - far superior - - typical neighborhood slice shop - - - nt - destination pizza place - - staff - friendly - - fault - - almost make - - - error - - food - - - - - last - time - - - order pie - - - - - put - wrong topping - - pie - - - - - picky guy - - - annoy - - - pay - - - - - pizza - - - - sure - - issue - - - hope - get - fix - delivery service - inconsistent - delivery time - - minute - - - food - arrive hot - time - - addition - - pie - salad - sandwich - also delicious - especially - pepperoni hero -
6  - - - - twice - - get - spinach ravioli - time - - - really good - - - - - complaint - - - go - yesterday - - pm - - place - filthy - - floor - disgust - - - - wipe crumb - - - seat -
7  nice pizza restaurant - marguerita pizza - great - - expensive
8  peppino - make - - solid pizza - - ingredient - top notch - - - dough - use - particularly good - - interior - - family style appeal - great neighborhood place - - - complaint - - - price - - - tad high - - local pizza joint - - small plain pizza - - - - - - small - totonno - -
9  - - - bad review - previously post - - make - trip - - - - delightfully surprised - - service - friendly - invite - - - pizza - - thin crust - - bit soggy - - - flavor played - - mouth - - perfect concerto - - make - salieri wannabe jealous - brick oven - - family environment add great touch - - real hero - - pizza - - - make - trip - - try - pasta - - - continued
# sum of counts
print("Group by neutral lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each neutral lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 neutral words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by neutral lemma'd words, add count and sort:
Get just the first row in each neutral lemma'd group
dfWords.head(10):
      lem  index    token    stem pos  counts
0      nt     19       nt      nt  NN   10076
1    good     46     good    good  JJ    9165
2    food     17     food    food  NN    7420
3   place      1    place   place  NN    7061
4     get    110      got     got  VB    6521
5      go     32       go      go  VB    5436
6    come    376     came    came  VB    4967
7   order    194  ordered   order  VB    4959
8  really     89   really  realli  RB    4240
9    well    452   better  better  RB    3879

Top 10 neutral words by part of speech:

POS_TYPE: NN
           lem  index       token     stem pos  counts
0           nt     19          nt       nt  NN   10076
2         food     17        food     food  NN    7420
3        place      1       place    place  NN    7061
11        time    127        time     time  NN    3510
13         try    104         try      tri  NN    3334
14     service     22     service   servic  NN    3311
17  restaurant     26  restaurant  restaur  NN    2742
18       table    154      tables     tabl  NN    2673
19        dish     30      dishes     dish  NN    2658
26      friend    108      friend   friend  NN    2346

POS_TYPE: JJ
          lem  index      token    stem pos  counts
1        good     46       good    good  JJ    9165
12      great     18      great   great  JJ    3397
24       nice     23       nice    nice  JJ    2375
25     little      0     little   littl  JJ    2360
32       much   1104       much    much  JJ    2067
42      small     10      small   small  JJ    1811
55  delicious    271  delicious  delici  JJ    1477
58       best    205       best    best  JJ    1456
62        bad    330        bad     bad  JJ    1418
71    overall    868    overall  overal  JJ    1200

POS_TYPE: VB
      lem  index    token   stem pos  counts
4     get    110      got    got  VB    6521
5      go     32       go     go  VB    5436
6    come    376     came   came  VB    4967
7   order    194  ordered  order  VB    4959
10   wait    159  waiting   wait  VB    3517
16   make    235    makes   make  VB    2892
22    say    135     said   said  VB    2522
23  taste    198   tasted   tast  VB    2508
29   give    377     give   give  VB    2192
34   take   1030     took   took  VB    2064

POS_TYPE: RB
           lem  index       token     stem pos  counts
8       really     89      really   realli  RB    4240
9         well    452      better   better  RB    3879
15        back    558        back     back  RB    2906
20      pretty    627      pretty   pretti  RB    2569
21        also     80        also     also  RB    2523
39        even    138        even     even  RB    1873
51  definitely    705  definitely  definit  RB    1558
67       first     97       first    first  RB    1365
69       still   1333       still    still  RB    1335
75       maybe   1021       maybe     mayb  RB    1186
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Neutral flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Neutral Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Neutral Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Neutral flatTokensList[:10]: ['this', 'little', 'place', 'in', 'soho', 'is', 'wonderful', '', 'i', 'had']

Neutral Frequency Distribution of all words[:30]:  [('the', 70369), ('and', 38237), ('i', 37776), ('a', 33433), ('was', 29465), ('to', 25569), ('it', 22772), ('of', 19388), ('but', 15686), ('for', 15603), ('is', 14668), ('in', 12669), ('that', 11733), ('with', 11172), ('nt', 10076), ('we', 9917), ('not', 9794), ('good', 9143), ('my', 8938), ('were', 8676), ('on', 8646), ('you', 8532), ('this', 8258), ('s', 8055), ('had', 7600), ('food', 7297), ('they', 7111), ('so', 6428), ('place', 6253), ('at', 5934)]
Neutral Frequency Distribution of lemma[:30]: [('nt', 10076), ('good', 9165), ('food', 7420), ('place', 7061), ('get', 6521), ('go', 5436), ('come', 4967), ('order', 4959), ('really', 4240), ('well', 3879), ('wait', 3517), ('time', 3510), ('great', 3397), ('try', 3334), ('service', 3311), ('back', 2906), ('make', 2892), ('restaurant', 2742), ('table', 2673), ('dish', 2658), ('pretty', 2569), ('also', 2523), ('say', 2522), ('taste', 2508), ('nice', 2375), ('little', 2360), ('friend', 2346), ('chicken', 2261), ('sauce', 2212), ('give', 2192)]
[matplotlib output: frequency plots of the 30 most common items, x-axis 'Samples', y-axis 'Counts']
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistNeutWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNeutLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in negative reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNegative['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder in the lemma list for non-matching tokens (overwritten when a match is found)
        lemmaTokensFromReview.append("-")
        # drop stopwords, then stem and lemmatize the remaining tokens
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # tag the token's part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the "-" placeholder with the lemma
                        lemmaTokensFromReview[-1] = lem
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    reviewsLemmaStrings.append(' '.join(lemmaTokensFromReview))
# build result df (lemma tokens from the last sampled review)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("Negative dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("Negative dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
Negative dfTokens.head(10):
        0
0       -
1      nt
2  bother
3       -
4       -
5  problem
6       -
7       -
8     wet
9       -

Negative dfLemma.head(10):
   lem
0  - meaning - try - place - - whilehighly recommend - - friend - - - tuna sandwich - good - get terribly sick - word - also - sage tea - nice -
1  stop - - lunch takeout - work - - ask - server - - well - roast vegetable sandwhich - vegetable souvlaki - - reply boastfully - - vegetarian souvlaki - apparently - enjoys tzatziki sauce run - - - pita - - - hand - - - - face - unfortunately - - - - - - taste - nt make - - - mess -
2  - walk halfway - manhattan - - restaurant - - wish - - stayed - home - - - decent food - - - seem - - get - - - - corner deli - - flavor seem bland - - menu cryptic - actual ingredient - - salad - - - give kudos - pleasant wine - attentive - - - overbear service -
3  hmm - - far - - - - impressed - stop - - grab - sandwich - take - - woman - - counter clearly wish - - hurry - - make - decision - - place - order - - give - - total - - - - - want - give - exactly - amount - - - - take - - penny - make - - - cent - - cent - - oh - - penny - - - nt - penny - - - - - - - - - - - suppose - know - - - - - nt accept penny - - nt penny money - - - - take - dime - - - penny - wtf - - give - - quarter instead - - - course - get back - dime - - - - need - - change - - told - - sit - - - - sandwich - - brought - - - ready - fine - - - - - seat - anyway - sat - - - bench - tick - tock - tick - tock - - minute go - - - - - - wonder seriously - long - take - make - bloody sandwich - - - go - - ask - - - turn - - sandwich - - sit - - counter - whole time - - girl - - different - - - - - counter say - - oh - - - - - - - - - employee - talk - - - - - place - - mean - - - sit - day - - bench - - watch - yuppie family pile - dknysporting toddler - - luxury suv - - - think - order - sandwich - - - hungry - - sandwich - - - turn - - - really good - lovely ciabatta bread - fresh ingredient - - order - marinate sandwich - - - allinall delightful - - - alidoro - - half - block away - - think - - give snack place - miss next time -
4  - food - average pizzeria - - cheap - add - - - fact - - puked - gut - - - bathroom - - - meal - - - - decide - - go back -
5  want - love - - look great come - - fell short - - - much oilgrease - good amount - cheese - sauce - even - crust - - good texture - - - soft - - - crispy - char - - - - flavor - - star - - sure - experienced well slice - - - star - friendly service - - - - - slicesoda recession deal - - nice owner - - say -
6  food - creative - thought provoke - - make sure - eat - - go - portion - ridiculously small - left - feel shortchanged - hungry - - - celery oyster stew - - know - - brooklyn - - - - - potato - oyster cracker filler cost - much - - stew scarcely cover - bottom - - cavernous bowl - conversation - dinner tend - echo - - expose side - say bowl - heard - story twice - dekalb - maybe focus less - furbish - wall - - restaurant - - - - furbish - wall - - bowl - cheapingredientyetfancynamedstew - probably - nt - - next time - p - semiredeeming quality - squash tot - good concept - - - squash - becomes - - - verb - meeting - fork -
7  - - - excite - try - place - - - - - - - block away - look packed whenever - walk - - holy schizza - - disapointed - - go - brunch - - busy sunday afternoon - - nt wait - get - food - - - pretty hungry - first - - - mimosa - lukewarm - second - - fry - lukewarm - well - taste - - - - cooked - minute ago - - din room - packed - - - - turnover - - decent - - food fresh - - egg - - omelette - - thick - - egg - fill ratio - - - - - - - - - thinly slice mushroom - - entire omelette - - half inch pancake concoction - egg - - - - service - - linger - meal - coffee - - eventually finish - cup - - see - waiter walk - - - fresh pot - coffee - - - already think - - head - - thanks - - thanks - - - - nt even get - chance - kindly pas - refill - coffee - - walk right - - - fill - lady - cup - go right back - put - pot away - maybe le parisien caught - - - bad day - mediocre food - - average service - nt belong - ny - - - good note - - - really cute - - - decor seem tres french -
8  - place - - deserve - star - - smoke salmon - old - taste horrible - - english muffin - - egg florentine - soggy - - omelette - greasy - - website make - restaurant seem upscale - - - cramped - shabby - - - - small toilet next - - kitchen - - trust yelp - want - bring - friend - - beautiful french restaurant - brunch - - - disgust - - instead - - - circumstance - - place rate - star -
9  go - - - second time - - - nt - good - - remember - order - pasta - steak frites - mussel appetizer - - drink - overall - underwhelming - total bill include tip come - - lil - - - - maybe people go - - - - - bistro - - area -
# sum of counts
print("Group by negative lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each negative lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 negative words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by negative lemma'd words, add count and sort:
Get just the first row in each negative lemma'd group
dfWords.head(10):
       lem  index    token    stem pos  counts
0       nt     43       nt      nt  NN    8346
1     food     54     food    food  NN    6852
2    place      2    place   place  NN    5643
3      get      9      got     got  VB    5069
4       go    141       go      go  VB    5030
5    order     89    order   order  NN    4608
6     good      8     good    good  JJ    4262
7     come    223   coming    come  VB    3914
8  service     73  service  servic  NN    3241
9     time    156     time    time  NN    3218

Top 10 negative words by part of speech:

POS_TYPE: NN
           lem  index       token     stem pos  counts
0           nt     43          nt       nt  NN    8346
1         food     54        food     food  NN    6852
2        place      2       place    place  NN    5643
5        order     89       order    order  NN    4608
8      service     73     service   servic  NN    3241
9         time    156        time     time  NN    3218
10        wait    342        wait     wait  NN    3145
13  restaurant     49  restaurant  restaur  NN    2862
14       table    586       table     tabl  NN    2812
21         try      1         try      tri  NN    2119

POS_TYPE: JJ
       lem  index   token   stem pos  counts
6     good      8    good   good  JJ    4262
23     bad    422     bad    bad  JJ    2051
29   great    222   great  great  JJ    1703
33    much    226    much   much  JJ    1571
57    nice     16    nice   nice  JJ    1129
61   small    264   small  small  JJ    1083
70  little    525  little  littl  JJ     922
73    best    638    best   best  JJ     907
85    many    767    many   mani  JJ     799
92    next    204    next   next  JJ     769

POS_TYPE: VB
     lem  index   token  stem pos  counts
3    get      9     got   got  VB    5069
4     go    141      go    go  VB    5030
7   come    223  coming  come  VB    3914
11   say    160    says   say  VB    2938
17  make     44    make  make  VB    2517
19  take     80    take  take  VB    2326
20   ask     21   asked   ask  VB    2163
22  give     67    give  give  VB    2069
25  want     92  wanted  want  VB    1947
31  know    106    know  know  VB    1589

POS_TYPE: RB
       lem  index   token    stem pos  counts
12    well     23  better  better  RB    2891
15    back    122    back    back  RB    2587
16  really    183  really  realli  RB    2521
18    even    232    even    even  RB    2349
34   never    613   never   never  RB    1555
37    also     13    also    also  RB    1471
46   first    347   first   first  RB    1334
62  pretty    345  pretty  pretti  RB    1065
71   maybe    298   maybe    mayb  RB     921
72   still    649   still   still  RB     916
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Negative flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Negative flatTokensList[:10]: ['been', 'meaning', 'to', 'try', 'this', 'place', 'for', 'a', 'whilehighly', 'recommended']

Negative Frequency Distribution of all words[:30]:  [('the', 53142), ('and', 30653), ('i', 29513), ('to', 24624), ('a', 23473), ('was', 22439), ('it', 16114), ('of', 15044), ('for', 11857), ('we', 11046), ('that', 10699), ('in', 10512), ('is', 10036), ('but', 9389), ('not', 8920), ('this', 8353), ('nt', 8346), ('my', 8013), ('with', 7458), ('were', 7155), ('you', 6863), ('they', 6796), ('food', 6761), ('on', 6700), ('had', 6235), ('at', 6051), ('s', 5612), ('so', 5521), ('have', 5212), ('place', 4960)]
Negative Frequency Distribution of lemma[:30]: [('nt', 8346), ('food', 6852), ('place', 5643), ('get', 5069), ('go', 5030), ('order', 4608), ('good', 4262), ('come', 3914), ('service', 3241), ('time', 3218), ('wait', 3145), ('say', 2938), ('well', 2891), ('restaurant', 2862), ('table', 2812), ('back', 2587), ('really', 2521), ('make', 2517), ('even', 2349), ('take', 2326), ('ask', 2163), ('try', 2119), ('give', 2069), ('bad', 2051), ('taste', 2034), ('want', 1947), ('eat', 1934), ('people', 1803), ('friend', 1768), ('great', 1703)]
[matplotlib output: frequency plots of the 30 most common items, x-axis 'Samples', y-axis 'Counts']
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistNegWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNegLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in real (non-fake) positive reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfPosReal['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder in the lemma list for non-matching tokens (overwritten when a match is found)
        lemmaTokensFromReview.append("-")
        # drop stopwords, then stem and lemmatize the remaining tokens
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # tag the token's part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the "-" placeholder with the lemma
                        lemmaTokensFromReview[-1] = lem
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    reviewsLemmaStrings.append(' '.join(lemmaTokensFromReview))
# build result df (lemma tokens from the last sampled review)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("positive real dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("positive real dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
positive real dfTokens.head(10):
      0
0  holy
1   god
2     -
3     -
4     -
5  live
6     -
7   nyc
8     -
9     -

positive real dfLemma.head(10):
   lem
0  best place - brunch - - - handle - wait - definitely get - mac - cheese -
1  freeman - - - hyped - belief - - - - bush twin rumor - - - - - - - still good food - - devilsonhorseback - prune stuffed - stilton cheese wrap - bacon - - lovely - - mac - cheese - - - someone - grandma make - - - winebytheglass choice - good - everything - pretty affordable - everyone say - - impossible - get - table - - - - monday - - - place - empty - - waitstaff - really friendly - helpful - -
2  cozy little greek place - love - meze - highly recommend - skordalia - potato garlic puree - - - tzatziki - cucumber yogurt dip - - - also - - - traditional dish - - think - greek food - spanakopitakia - spinach pie - pastitsio - mousaka - - nt forget - greek dessert - yogurt - - honey - - course - - halva - baklava -
3  pylos honor - breadth - traditional greek cuisine - bring fresh cooking - - region - greece - - elegant - contemporary - comfortable set - - east village - next time - - - new york - - nt miss - hidden jewel - - owner christos - - classy man - - take care - - - - - - - best friend - pylosrestaurantcom
4  go - - sunday brunch - - - visit - - - - satisfy meal - - egg - toast - home potato - - nice thick bacon - slice - chocolatepeanut butter cream pie share - - - - end - - nice topper - - impressed - - quality buttermilk biscuit - - table - seat - - greatly need - - long wait - make - starve - bit pricey - - wait detract - star - - - - definately recommend - - lazy sunday meal -
5  come - - buffalo wing - stay - - catfish burger - also feature - buffalo - ny favorite - beef - weck - hockey - ever - - - - - tv - good fry - serve - chipotle mayo -
6  expert - polenta - homemade hummus - chicken - brie sandwich - - pretty much everything else - bonnie - - reliably excellent food - - - loses point - - occasional excessively greasy sirloin burger - - - - - fool - - put - chipotle mayo - - - fry - scarf - - - - - also - amaze beer selection -
7  best espresso - nyc - - - good friend google agrees - - - - - - go - - little - - - - nyc visit friend - - buddy josh - - foodie - - espresso naziconnisseur - show - - new spot - - - tell - - - - want - best espresso - town - - - pay - - cab - let - go - - - - tell - - - walk - - joint - - block away - - - - way - tell - - acidity - foam - temperature - - - - - - sit - - - charm caferestaurant - - thoroughly impressed - - place smell italian - - - tough - explain - old place - europe - - funky smell - - remember forever - - joint - - import - euro stank - authenticity - - work - - - charm - - - pant - - espresso - - best - - ever - - - - drank - share - - - - - acidic - linger - - - mouth - - deep richness - - get home - - - forgotten - name - - amaze cafe - - - know - much - - critic josh - - - google - best espresso nyc - - low - behold - - first result - - quadronno - check - - website - - - see - - sat right - - viking mural -
8  locate - - heart - manhattan - theater district - carmine - - - pack - - - - open - june - - - - - pasta - sound steep - - everything - - primarily - - southern region - italy - - serve family style - - huge platter overflow - food - - need - come - - - empty stomach - - willingness - set aside - diet regime - - day - - carmine - chef - stuff - - - typical italian mother - believe - - - - - waiter - actually advise - - cut back - - feel - - order - much food - - party - - indulge - - hot antipasto - penne - la vodka - - chicken marsala - - brought home enough leftover - - - meal - - carmine - - nt serve - - quantity - everything - fresh - deliciously season - - cooked - order - especially - pasta - - - perfectly al dente - dessert - - delicious - - - - never - room - sample - - - choice - - atmosphere - festive - - - big fat italian wedding - - - - go - - - - - hour - - - pm - - - urgent - - make reservation well - advance - - - mean week - - day - - - otherwise - wait - - table - - long - - accept reservation - - size party - pm - - - pm - - party - - - - - open - pm sunday - monday - midnight - rest - - week - - - - - perfect place - - posttheater supper - - - - nt mind go - bed - - - full stomach - - visit - new york city - complete - - meal - carmine - - - - - recommend - gem enough -
9  - nt go wrong - - - - burger - long line - poor service - - - beef - good - however - - - best - - city - see - review -
# count occurrences per lemma
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each positive real lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in positive real reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each positive real lemma'd group
dfWords.head(10):
     lem  index     token   stem pos  counts
0  place      1     place  place  NN   34855
1     nt     76        nt     nt  NN   33788
2   good     16      good   good  JJ   32273
3   food     17      food   food  NN   31760
4    get      6       get    get  VB   29120
5  great    661     great  great  JJ   26826
6     go    119      went   went  VB   26333
7   come    163      come   come  VB   21021
8  order    363  ordering  order  VB   19094
9   time    103      time   time  NN   17682

Top 10 words by part of speech used in positive real reviews:

POS_TYPE: NN
           lem  index        token     stem pos  counts
0        place      1        place    place  NN   34855
1           nt     76           nt       nt  NN   33788
3         food     17         food     food  NN   31760
9         time    103         time     time  NN   17682
13         try    673          try      tri  NN   15837
14        wait      4         wait     wait  NN   15251
16        love     53         love     love  NN   14969
19     service    463      service   servic  NN   13531
20  restaurant    579  restaurants  restaur  NN   12877
22       sauce    824        sauce     sauc  NN   11567

POS_TYPE: JJ
          lem  index      token    stem pos  counts
2        good     16       good    good  JJ   32273
5       great    661      great   great  JJ   26826
12  delicious    396  delicious  delici  JJ   16104
21       best      0       best    best  JJ   12730
32       nice    129       nice    nice  JJ    9830
33     little     50     little   littl  JJ    9745
45      small   1282      small   small  JJ    7572
51      fresh     92      fresh   fresh  JJ    7138
54       much    192       much    much  JJ    7044
77        new    104        new     new  JJ    5627

POS_TYPE: VB
      lem  index     token   stem pos  counts
4     get      6       get    get  VB   29120
6      go    119      went   went  VB   26333
7    come    163      come   come  VB   21021
8   order    363  ordering  order  VB   19094
11   make     30      made   made  VB   16254
31  amaze    213   amazing   amaz  VB    9968
34    say     38      says    say  VB    9525
38   take    114      take   take  VB    8628
50   seat    146   seating   seat  VB    7306
55   give   1669     gives   give  VB    6889

POS_TYPE: RB
           lem  index       token      stem pos  counts
10      really     46      really    realli  RB   16676
15        well    413        well      well  RB   15039
17        back    361        back      back  RB   14215
18        also     65        also      also  RB   13760
24  definitely      5  definitely   definit  RB   10709
35        even    499        even      even  RB    9437
49      pretty     35      pretty    pretti  RB    7324
56      always    629      always     alway  RB    6806
65       first    306       first     first  RB    6434
72    friendly     47    friendly  friendli  RB    6023
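# the groupby/transform('count') above attaches each lemma's total count to
# every row, so groupby('lem').first() keeps one representative token/stem
# per lemma. as a sanity check, value_counts() should reproduce the totals
# (cross-check sketch; dfAllWords and dfWords are the frames built above)
lemCounts = dfAllWords['lem'].value_counts()
assert lemCounts['place'] == dfWords.loc[dfWords['lem'] == 'place', 'counts'].iloc[0]
print(lemCounts.head(10))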
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("PosRealReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("PosRealReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("PosRealReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
PosRealReview flatTokensList[:10]: ['best', 'place', 'for', 'brunch', 'if', 'you', 'can', 'handle', 'the', 'wait']

PosRealReview Frequency Distribution of all words[:30]:  [('the', 314511), ('and', 200258), ('i', 159158), ('a', 155092), ('to', 112463), ('was', 100859), ('it', 94501), ('of', 93623), ('is', 79486), ('for', 66962), ('in', 63505), ('with', 58824), ('but', 50548), ('you', 49955), ('that', 47654), ('we', 44251), ('this', 44214), ('my', 43657), ('on', 38881), ('s', 38550), ('had', 35069), ('nt', 33788), ('so', 32750), ('they', 32167), ('good', 32137), ('place', 31376), ('food', 31202), ('were', 30978), ('not', 28659), ('have', 28368)]
PosRealReview Frequency Distribution of lemma[:30]: [('place', 34855), ('nt', 33788), ('good', 32273), ('food', 31760), ('get', 29120), ('great', 26826), ('go', 26333), ('come', 21021), ('order', 19094), ('time', 17682), ('really', 16676), ('make', 16254), ('delicious', 16104), ('try', 15837), ('wait', 15251), ('well', 15039), ('love', 14969), ('back', 14215), ('also', 13760), ('service', 13531), ('restaurant', 12877), ('best', 12730), ('sauce', 11567), ('dish', 11249), ('definitely', 10709), ('eat', 10628), ('friend', 10589), ('menu', 10494), ('chicken', 10245), ('fry', 10192)]
[frequency distribution plots: top 30 tokens and top 30 lemmas, x=Samples, y=Counts]
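# nltk.FreqDist subclasses collections.Counter, so the manual
# sorted(freqDist.items(), ...) calls above can be replaced by the built-in
# most_common(); equivalent ranking, shown here as a sketch
print(freqDist.most_common(30))
print(freqDist2.most_common(30))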
# output frequency data to csv for further analysis in alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistPosRealRevLemma.csv", encoding = 'utf-8', index = False, header = False)
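# the file is written without a header row, so a reader must supply column
# names; quick round-trip check (sketch; the 'lem'/'count' names are ours,
# not part of the file)
dfCheck = pd.read_csv("../YelpData/YelpNYC/freqDistPosRealRevLemma.csv",
                      header=None, names=['lem', 'count'])
print(dfCheck.head())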
# process reviews by removing stopwords in negative real reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNegReal['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder in the lemma list; replaced below when the token is kept
        lemmaTokensFromReview.append("-")
        # drop stopwords, non-alphanumeric tokens and very short tokens
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # tag the token's part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        lemmaTokensFromReview[-1] = lem
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    reviewsLemmaStrings.append(' '.join(lemmaTokensFromReview))
# build result df from the last sampled review's lemma tokens
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("negative real dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string in string-typed columns
for col in dfTokens:
    if str(dfTokens[col].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[col] = dfTokens[col].fillna('')
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns=['lem'])
print("negative real dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
negative real dfTokens.head(10):
            0
0           -
1       first
2  experience
3           -
4           -
5  restaurant
6           -
7         get
8           -
9           -

negative real dfLemma.head(10):
[10 sampled negative real reviews rendered as lemma strings, with "-" marking dropped tokens; row 0 begins: "middle eastern cuisine - - mediocre - - ok - - - drunk - want something salty - crunchy ..."]
# count occurrences per lemma
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each negative real lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in negative real reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each negative real lemma'd group
dfWords.head(10):
     lem  index   token   stem pos  counts
0     nt     52      nt     nt  NN    7314
1   food     58    food   food  NN    5640
2  place    137   place  place  NN    4849
3    get    134     get    get  VB    4421
4     go     97      go     go  VB    4281
5  order    190   order  order  NN    4074
6   good     59    good   good  JJ    3716
7   come    264    come   come  VB    3465
8   time     26   times   time  NN    2786
9   wait    370  waited   wait  VB    2773

Top 10 words by part of speech used in negative real reviews:

POS_TYPE: NN
           lem  index       token     stem pos  counts
0           nt     52          nt       nt  NN    7314
1         food     58        food     food  NN    5640
2        place    137       place    place  NN    4849
5        order    190       order    order  NN    4074
8         time     26       times     time  NN    2786
10     service    164     service   servic  NN    2675
13       table    373       table     tabl  NN    2483
14  restaurant     69  restaurant  restaur  NN    2359
23       taste   1400       taste     tast  NN    1766
24        want      6        want     want  NN    1723

POS_TYPE: JJ
       lem  index   token   stem pos  counts
6     good     59    good   good  JJ    3716
25     bad    437     bad    bad  JJ    1686
29   great     62   great  great  JJ    1523
32    much    420    much   much  JJ    1377
57    nice   1026    nice   nice  JJ     980
62   small    114   small  small  JJ     905
70  little    162  little  littl  JJ     807
72    best     72    best   best  JJ     796
86    next     78    next   next  JJ     684
89    many     91    many   mani  JJ     677

POS_TYPE: VB
      lem  index   token  stem pos  counts
3     get    134     get   get  VB    4421
4      go     97      go    go  VB    4281
7    come    264    come  come  VB    3465
9    wait    370  waited  wait  VB    2773
12    say     43     say   say  VB    2522
15   make     31   makes  make  VB    2310
19   take    397    take  take  VB    1924
20    ask    570   asked   ask  VB    1791
21    try    102  trying   tri  VB    1782
22   give     10    give  give  VB    1771

POS_TYPE: RB
       lem  index   token    stem pos  counts
11    well    363    well    well  RB    2531
16  really     33  really  realli  RB    2280
17    back     98    back    back  RB    2170
18    even     60    even    even  RB    2075
37    also    638    also    also  RB    1263
41   never    441   never   never  RB    1189
50   first    374   first   first  RB    1087
56  pretty   1260  pretty  pretti  RB    1029
66   still    433   still   still  RB     853
75    long    371    long    long  RB     779
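# caveat: nltk.pos_tag([newToken]) tags each token in isolation, so
# context-dependent words ('wait', 'want', 'order') often default to NN,
# which is visible in the tables above. tagging the whole token list lets
# the tagger use context; illustrative sketch with a made-up sentence
sample = "we waited an hour but the food was worth the wait"
print(nltk.pos_tag(nltk.word_tokenize(sample)))
# in context the tagger can typically mark 'waited' as a verb and the
# final 'wait' as a noun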
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Negative RealReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative RealReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative RealReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Negative RealReview flatTokensList[:10]: ['middle', 'eastern', 'cuisine', 'that', 's', 'mediocre', '', 'and', 'ok', 'if']

Negative RealReview Frequency Distribution of all words[:30]:  [('the', 45943), ('and', 26254), ('i', 25613), ('to', 20932), ('a', 20391), ('was', 19491), ('it', 14226), ('of', 13363), ('for', 10324), ('we', 9543), ('that', 9441), ('in', 9052), ('is', 8636), ('but', 8392), ('not', 7654), ('nt', 7314), ('this', 7114), ('my', 6921), ('with', 6398), ('were', 6310), ('you', 5946), ('on', 5842), ('they', 5688), ('food', 5568), ('had', 5176), ('at', 5082), ('s', 4904), ('so', 4707), ('have', 4416), ('place', 4193)]
Negative RealReview Frequency Distribution of lemma[:30]: [('nt', 7314), ('food', 5640), ('place', 4849), ('get', 4421), ('go', 4281), ('order', 4074), ('good', 3716), ('come', 3465), ('time', 2786), ('wait', 2773), ('service', 2675), ('well', 2531), ('say', 2522), ('table', 2483), ('restaurant', 2359), ('make', 2310), ('really', 2280), ('back', 2170), ('even', 2075), ('take', 1924), ('ask', 1791), ('try', 1782), ('give', 1771), ('taste', 1766), ('want', 1723), ('bad', 1686), ('eat', 1583), ('friend', 1544), ('people', 1528), ('great', 1523)]
[frequency distribution plots: top 30 tokens and top 30 lemmas, x=Samples, y=Counts]
# output frequency data to csv for further analysis in alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNegRealRevLemma.csv", encoding = 'utf-8', index = False, header = False)
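# the tokenize / POS-tag / lemmatize cell above is repeated verbatim for each
# review subset; a helper along these lines could replace the copies.
# minimal sketch, assuming the globals defined earlier (remove_accents,
# stopwords, stemmer, lemmatizer, DI_POS_TYPES, POS_TYPES, RE_VALID,
# MIN_STR_LEN, SAMPLE_STEP); it uses a local counter instead of the global ITER
def process_reviews(reviews):
    reviewsTokens, reviewsTokenLists, reviewsLemmaStrings = [], [], []
    for i, review in enumerate(reviews):
        if i % SAMPLE_STEP != 0:
            continue  # sample every SAMPLE_STEP-th review to limit memory use
        tokens = [word.lower() for sent in nltk.sent_tokenize(review)
                  for word in nltk.word_tokenize(sent)]
        tokensFromReview = []
        lemmaTokensFromReview = []
        for token in tokens:
            newToken = remove_accents(token).translate(
                str.maketrans('', '', string.punctuation))
            tokensFromReview.append(newToken)
            lemmaTokensFromReview.append("-")  # placeholder, replaced on match
            if (newToken not in stopwords and re.search(RE_VALID, newToken)
                    and len(newToken) >= MIN_STR_LEN):
                pos = nltk.pos_tag([newToken])[0][1][:2]
                if pos in POS_TYPES:
                    lem = lemmatizer.lemmatize(newToken, pos=DI_POS_TYPES[pos])
                    reviewsTokens.append((newToken, stemmer.stem(newToken), lem, pos))
                    lemmaTokensFromReview[-1] = lem
        reviewsTokenLists.append(tokensFromReview)
        reviewsLemmaStrings.append(' '.join(lemmaTokensFromReview))
    return reviewsTokens, reviewsTokenLists, reviewsLemmaStrings
# usage would be e.g.:
# reviewsTokens, reviewsTokenLists, reviewsLemmaStrings = process_reviews(dfPosFake['Review'])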
# process reviews by removing stopwords in positive fake reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfPosFake['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder in the lemma list; replaced below when the token is kept
        lemmaTokensFromReview.append("-")
        # drop stopwords, non-alphanumeric tokens and very short tokens
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # tag the token's part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        lemmaTokensFromReview[-1] = lem
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    reviewsLemmaStrings.append(' '.join(lemmaTokensFromReview))
# build result df from the last sampled review's lemma tokens
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("positive fake dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string in string-typed columns
for col in dfTokens:
    if str(dfTokens[col].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[col] = dfTokens[col].fillna('')
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns=['lem'])
print("positive fake dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
positive fake dfTokens.head(10):
           0
0          -
1      place
2          -
3  recommend
4          -
5          -
6   coworker
7          -
8       come
9          -

positive fake dfLemma.head(10):
[10 sampled positive fake reviews rendered as lemma strings, with "-" marking dropped tokens; row 0 begins: "- right - - - - - deal - - reality - - gramercy tavern - - - bar - night ..."]
# count occurrences per lemma
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each positive fake lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in positive fake reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each positive fake lemma'd group
dfWords.head(10):
       lem  index    token    stem pos  counts
0     food     12     food    food  NN    3495
1    place     93   places   place  NN    3121
2    great     11    great   great  JJ    3075
3     good    122     good    good  JJ    2521
4       go    146       go      go  VB    2275
5       nt     37       nt      nt  NN    1926
6      get     26      get     get  VB    1733
7  service     34  service  servic  NN    1511
8     love    403     love    love  NN    1466
9     time    128     time    time  NN    1427

Top 10 words by part of speech used in positive fake reviews:

POS_TYPE: NN
           lem  index       token     stem pos  counts
0         food     12        food     food  NN    3495
1        place     93      places    place  NN    3121
5           nt     37          nt       nt  NN    1926
7      service     34     service   servic  NN    1511
8         love    403        love     love  NN    1466
9         time    128        time     time  NN    1427
12  restaurant    234  restaurant  restaur  NN    1286
14         try     83         try      tri  NN    1204
17        wait     99        wait     wait  NN    1123
20       order    179       order    order  NN    1020

POS_TYPE: JJ
          lem  index      token    stem pos  counts
2       great     11      great   great  JJ    3075
3        good    122       good    good  JJ    2521
10       best     16       best    best  JJ    1359
11  delicious    336  delicious  delici  JJ    1307
24       nice    223       nice    nice  JJ     845
43        new    611        new     new  JJ     591
49      fresh    243      fresh   fresh  JJ     568
54     little    199     little   littl  JJ     540
63      small    182      small   small  JJ     453
78       much    160       much    much  JJ     395

POS_TYPE: VB
      lem  index    token  stem pos  counts
4      go    146       go    go  VB    2275
6     get     26      get   get  VB    1733
13   make    159     made  made  VB    1208
15   come    595     come  come  VB    1194
21  amaze    475  amazing  amaz  VB    1015
25    eat      8   eating   eat  VB     810
34    say   1196     said  said  VB     664
37  taste    287  tasting  tast  VB     632
39   take    100     take  take  VB     611
59    fry   1057    fried   fri  VB     495

POS_TYPE: RB
           lem  index       token       stem pos  counts
16        back     40        back       back  RB    1124
18      really    114      really     realli  RB    1115
19        well    161      better     better  RB    1088
23        also     30        also       also  RB     878
27      always     56      always      alway  RB     776
30  definitely    305  definitely    definit  RB     728
31        even     28        even       even  RB     715
33    friendly    820    friendly   friendli  RB     670
47  atmosphere    676  atmosphere  atmospher  RB     576
51        ever     17        ever       ever  RB     558
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Positive FakeReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive FakeReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive FakeReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Positive FakeReview flatTokensList[:10]: ['all', 'right', '', 'so', 'here', 's', 'the', 'deal', '', 'the']

Positive FakeReview Frequency Distribution of all words[:30]:  [('the', 22041), ('and', 14820), ('i', 10484), ('a', 10136), ('to', 8143), ('is', 7093), ('was', 6140), ('it', 5870), ('of', 5704), ('for', 4617), ('in', 4475), ('with', 3610), ('you', 3475), ('this', 3460), ('food', 3458), ('we', 3059), ('great', 3052), ('but', 3029), ('my', 2927), ('place', 2897), ('that', 2846), ('good', 2509), ('on', 2400), ('had', 2380), ('s', 2362), ('are', 2317), ('they', 2309), ('have', 2279), ('so', 2129), ('very', 1947)]
Positive FakeReview Frequency Distribution of lemma[:30]: [('food', 3495), ('place', 3121), ('great', 3075), ('good', 2521), ('go', 2275), ('nt', 1926), ('get', 1733), ('service', 1511), ('love', 1466), ('time', 1427), ('best', 1359), ('delicious', 1307), ('restaurant', 1286), ('make', 1208), ('try', 1204), ('come', 1194), ('back', 1124), ('wait', 1123), ('really', 1115), ('well', 1088), ('order', 1020), ('amaze', 1015), ('pizza', 946), ('also', 878), ('nice', 845), ('eat', 810), ('friend', 785), ('always', 776), ('staff', 765), ('menu', 758)]
[frequency distribution plots: top 30 tokens and top 30 lemmas, x=Samples, y=Counts]
# output frequency data to csv for further analysis in alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistPosFakeRevLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in negative fake reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNegFake['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder in the lemma list; replaced below when the token is kept
        lemmaTokensFromReview.append("-")
        # drop stopwords, non-alphanumeric tokens and very short tokens
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # tag the token's part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        lemmaTokensFromReview[-1] = lem
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    reviewsLemmaStrings.append(' '.join(lemmaTokensFromReview))
# build result df from the last sampled review's lemma tokens
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("negative fake dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string in string-typed columns
for col in dfTokens:
    if str(dfTokens[col].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[col] = dfTokens[col].fillna('')
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns=['lem'])
print("negative fake dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
negative fake dfTokens.head(10):
            0
0    honestly
1           -
2  everything
3         get
4        ruin
5           -
6           -
7     service
8           -
9           -

negative fake dfLemma.head(10):
[10 sampled negative fake reviews rendered as lemma strings, with "-" marking dropped tokens; row 0 begins: "overrate - something - - place irks - - - - - bland martini - - - entry - - banal menu ..."]
# count occurrences per lemma
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each negative fake lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in negative fake reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each negative fake lemma'd group
dfWords.head(10):
          lem  index       token     stem pos  counts
0        food    197        food     food  NN    1215
1          nt      9          nt       nt  NN    1189
2       place      2       place    place  NN     925
3          go     10          go       go  VB     847
4         get     44         get      get  VB     792
5       order    174     ordered    order  VB     668
6        good    227        good     good  JJ     615
7        come    163        came     came  VB     605
8  restaurant     66  restaurant  restaur  NN     575
9     service    268     service   servic  NN     570

Top 10 words by part of speech used in negative fake reviews:

POS_TYPE: NN
           lem  index       token     stem pos  counts
0         food    197        food     food  NN    1215
1           nt      9          nt       nt  NN    1189
2        place      2       place    place  NN     925
8   restaurant     66  restaurant  restaur  NN     575
9      service    268     service   servic  NN     570
10        time    529        time     time  NN     521
11        wait     74        wait     wait  NN     516
13       table    129       table     tabl  NN     495
26      friend    398     friends   friend  NN     315
27      minute    291     minutes    minut  NN     305

POS_TYPE: JJ
      lem  index  token   stem pos  counts
6    good    227   good   good  JJ     615
15    bad    139    bad    bad  JJ     408
30  great    327  great  great  JJ     269
42   much     76   much   much  JJ     215
60   many    460   many   mani  JJ     160
63  small     72  small  small  JJ     155
66   nice    236   nice   nice  JJ     152
70   last    121   last   last  JJ     145
71    new    712    new    new  JJ     144
78   next    908   next   next  JJ     133

POS_TYPE: VB
      lem  index    token   stem pos  counts
3      go     10       go     go  VB     847
4     get     44      get    get  VB     792
5   order    174  ordered  order  VB     668
7    come    163     came   came  VB     605
12    say     13   saying    say  VB     511
16   make    372   making   make  VB     405
18    ask     52    asked    ask  VB     382
21    eat    459    eaten  eaten  VB     362
23   take    494   taking   take  VB     358
24   give     27     gave   gave  VB     327

POS_TYPE: RB
       lem  index   token    stem pos  counts
14    back    286    back    back  RB     445
17    well     77  better  better  RB     399
19    even    132    even    even  RB     373
20   never     14   never   never  RB     364
22  really    393  really  realli  RB     358
43   first    111   first   first  RB     211
53    also   1032    also    also  RB     181
59    ever    415    ever    ever  RB     162
80    long   2806    long    long  RB     133
84    away    498    away    away  RB     128
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Negative FakeReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative FakeReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative FakeReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Negative FakeReview flatTokensList[:10]: ['overrated', '', 'something', 'about', 'this', 'place', 'irks', 'me', '', 'was']

Negative FakeReview Frequency Distribution of all words[:30]:  [('the', 8101), ('and', 4719), ('i', 4506), ('to', 4150), ('a', 3475), ('was', 3226), ('it', 2349), ('of', 2242), ('we', 1937), ('for', 1825), ('in', 1762), ('that', 1633), ('is', 1625), ('not', 1426), ('this', 1367), ('but', 1337), ('my', 1271), ('food', 1199), ('nt', 1189), ('they', 1109), ('with', 1073), ('were', 1072), ('you', 1019), ('at', 1009), ('on', 998), ('had', 953), ('have', 886), ('place', 813), ('s', 805), ('so', 791)]
Negative FakeReview Frequency Distribution of lemma[:30]: [('food', 1215), ('nt', 1189), ('place', 925), ('go', 847), ('get', 792), ('order', 668), ('good', 615), ('come', 605), ('restaurant', 575), ('service', 570), ('time', 521), ('wait', 516), ('say', 511), ('table', 495), ('back', 445), ('bad', 408), ('make', 405), ('well', 399), ('ask', 382), ('even', 373), ('never', 364), ('eat', 362), ('really', 358), ('take', 358), ('give', 327), ('want', 321), ('friend', 315), ('minute', 305), ('try', 302), ('people', 294)]
[frequency distribution plots: top 30 tokens and top 30 lemmas, x=Samples, y=Counts]
# output frequency data to csv for further analysis in alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNegFakeRevLemma.csv", encoding = 'utf-8', index = False, header = False)
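# the four lemma-frequency files written above can also be compared directly
# in pandas before handing off to alteryx. minimal sketch, assuming the four
# csv files exported in this notebook; counts are normalized to relative
# frequencies so subsets of different sizes are comparable
files = {'posReal': 'freqDistPosRealRevLemma.csv',
         'negReal': 'freqDistNegRealRevLemma.csv',
         'posFake': 'freqDistPosFakeRevLemma.csv',
         'negFake': 'freqDistNegFakeRevLemma.csv'}
frames = []
for label, name in files.items():
    dfF = pd.read_csv("../YelpData/YelpNYC/" + name, header=None, names=['lem', label])
    dfF[label] = dfF[label] / dfF[label].sum()  # count -> relative frequency
    frames.append(dfF.set_index('lem'))
dfCompare = pd.concat(frames, axis=1).fillna(0)
print(dfCompare.sort_values('posReal', ascending=False).head(15))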