Classifying Medicine, Full Code
#--------------------------
library(jsonlite)
# download data from: http://www.yelp.com/dataset_challenge
# data assumed to be in directory 'yelp_dataset_challenge_academic_dataset',
# which is inside the current working directory
dirname <- "yelp_dataset_challenge_academic_dataset"
filestem <- "yelp_academic_dataset_"
filenamepart <- c("business", "checkin", "review", "tip", "user")
fileext <- ".json"
alldata <- list()
for (iter in 1:length(filenamepart)) {
filename = paste(dirname, "/", filestem, filenamepart[iter], fileext, sep = "")
dataframe = filenamepart[iter]
alldata[[iter]] <- fromJSON(sprintf("[%s]", paste(readLines(filename), collapse = ",")))
print(filename)
print(dataframe)
}
names(alldata) <- filenamepart
saveRDS(alldata, file = "revs_json_read.rds")
#--------------------------
alldata <- readRDS("revs_json_read.rds")
# identify rows of entries pertaining to doctors or hospitals, i.e., "mainstream" medicine
# including "Health" here does not change number of physicians or hospitals included; it
# does increase numbers of "Counseling & Mental Health", "Diagnostic Imaging", "Diagnostic
# Services", and "Optometrists". It's debatable whether they should be included.
medrows = unique(c(grep("Doctor", alldata$business$categories),
grep("Hospital", alldata$business$categories),
grep("Allergist", alldata$business$categories),
grep("Anesthesiologist", alldata$business$categories),
grep("Cardiologist", alldata$business$categories),
grep("Surgeon", alldata$business$categories),
grep("Dentist", alldata$business$categories),
grep("Drugstore", alldata$business$categories),
grep("Ear Nose & Throat", alldata$business$categories),
grep("Endodontist", alldata$business$categories),
grep("Internal Medicine", alldata$business$categories),
grep("Laser Eye Surgery/Lasik", alldata$business$categories),
grep("Obstetrician", alldata$business$categories),
grep("Gastroenterologist", alldata$business$categories),
grep("Gynecologist", alldata$business$categories),
grep("Ophthalmologist", alldata$business$categories),
grep("Oncologist", alldata$business$categories),
grep("Orthodontist", alldata$business$categories),
grep("Orthopedist", alldata$business$categories),
grep("Orthotic", alldata$business$categories),
grep("Pediatric", alldata$business$categories),
grep("Periodontist", alldata$business$categories),
grep("Pharmacy", alldata$business$categories),
grep("Podiatrist", alldata$business$categories),
grep("Psychiatrist", alldata$business$categories),
grep("Pulmonologist", alldata$business$categories),
grep("Radiologist", alldata$business$categories),
grep("Rheumatologist", alldata$business$categories),
grep("Urologist", alldata$business$categories),
grep("Medical Center", alldata$business$categories)))
# length(medrows)
# [1] 2632
# remove rows of "alternative" medicine
rmaltmedrows = unique(c(grep("Acupuncture", alldata$business$categories[medrows]),
grep("Massage", alldata$business$categories[medrows]),
grep("Naturopath", alldata$business$categories[medrows]),
grep("Psychic", alldata$business$categories[medrows]),
grep("Yoga", alldata$business$categories[medrows]),
grep("Spas", alldata$business$categories[medrows]),
grep("Food", alldata$business$categories[medrows]),
grep("Fitness", alldata$business$categories[medrows]),
grep("Osteopath", alldata$business$categories[medrows]),
grep("Chinese Medicine", alldata$business$categories[medrows]),
grep("Shopping", alldata$business$categories[medrows]),
grep("Chiropractor", alldata$business$categories[medrows]),
grep("Cannabis", alldata$business$categories[medrows]),
grep("Reflexology", alldata$business$categories[medrows]),
grep("Rolfing", alldata$business$categories[medrows]),
grep("Coach", alldata$business$categories[medrows]),
grep("Reiki", alldata$business$categories[medrows])))
medrows = medrows[-rmaltmedrows]
# length(rmaltmedrows)
# [1] 753
# length(medrows)
# [1] 1879
#--------------------------
# identify rows of entries clearly pertaining to "alternative" medicine
# includes only words that are clearly alternative medicine and excludes words that
# might be compatible with both mainstream and alternative medicine (e.g., "yoga"
# could be considered exercise and thus compatible with mainstream medicine).
altmedrows = unique(c(grep("Acupuncture", alldata$business$categories),
grep("Chiropractor", alldata$business$categories),
grep("Chinese Medicine", alldata$business$categories),
grep("Reflexology", alldata$business$categories),
grep("Reiki", alldata$business$categories),
grep("Osteopath", alldata$business$categories),
grep("Rolfing", alldata$business$categories),
grep("Naturopathic", alldata$business$categories)))
# length(altmedrows)
# [1] 378
# remove rows of "mainstream" medicine
rmmedrows = unique(c(grep("Dermatologists", alldata$business$categories[altmedrows]),
grep("Neurologist", alldata$business$categories[altmedrows]),
grep("Obstetrician", alldata$business$categories[altmedrows]),
grep("Gynecologist", alldata$business$categories[altmedrows]),
grep("Orthopedist", alldata$business$categories[altmedrows]),
grep("Allergist", alldata$business$categories[altmedrows]),
grep("Internal Medicine", alldata$business$categories[altmedrows])))
altmedrows = altmedrows[-rmmedrows]
# length(rmmedrows)
# [1] 8
# length(altmedrows)
# [1] 370
# no businesses were classified as both conventional and alternative medicine
intersect(medrows, altmedrows)
intersect(alldata$business$business_id[medrows], alldata$business$business_id[altmedrows])
# integer(0)
# character(0)
#--------------------------
# this section assesses what business categories are included in each group
# number of clearly mainstream medicine businesses (after eliminating businesses associated with alternative medicine words)
length(medrows)
# [1] 1879
# number of reviews of clearly mainstream medicine businesses
sum(alldata$business$review_count[medrows])
# [1] 14250
# business categories associated with clearly mainstream medicine businesses
table(unlist(alldata$business$categories[medrows]))
# mednames = as.data.frame(table(unlist(alldata$business$categories[medrows])))[ , 1]
# number of clearly alternative medicine businesses (after eliminating businesses associated with mainstream medicine words)
length(altmedrows)
# [1] 370
# number of reviews of clearly alternative medicine businesses
sum(alldata$business$review_count[altmedrows])
# [1] 3136
# business categories associated with clearly alternative medicine businesses
table(unlist(alldata$business$categories[altmedrows]))
#--------------------------
# get row indices for reviews of mainstream medicine businesses
medrevrows = NA
for (iter in 1:length(medrows)) {
medrevrows = c(medrevrows, which(alldata$review$business_id ==
alldata$business$business_id[medrows[iter]]))
}
medrevrows = medrevrows[-1]
# get row indices for reviews of alternative medicine businesses
altmedrevrows = NA
for (iter in 1:length(altmedrows)) {
altmedrevrows = c(altmedrevrows, which(alldata$review$business_id ==
alldata$business$business_id[altmedrows[iter]]))
}
altmedrevrows = altmedrevrows[-1]
length(medrevrows)
# [1] 12957
length(altmedrevrows)
# [1] 2883
# matching business IDs to get reviews (here) and summing business review counts (above)
# don't give same number of reviews: mainstream medicine - 14250 above vs. 12957 here;
# alternative medicine - 3136 above vs. 2883 here
# maybe Yelp review counts are inaccurate somehow?
# create data frame of only mainstream and alternative medicine reviews
medrevs = alldata$review[medrevrows, c(1:4, 6, 8)]
altmedrevs = alldata$review[altmedrevrows, c(1:4, 6, 8)]
medrevs[ , "medicine"] = "conventional"
altmedrevs[ , "medicine"] = "alternative"
revs = rbind(medrevs, altmedrevs)
saveRDS(revs, file = "revs.rds")
# number of establishments in 'revs' doesn't match number from 'alldata$business'
length(medrows) + length(altmedrows)
# [1] 2249
length(unique(revs$business_id))
# [1] 2226
(length(medrows) + length(altmedrows)) - length(unique(revs$business_id))
# [1] 23
# maybe those 23 businesses didn't have any reviews?
# first, verify business IDs are unique
length(alldata$business$business_id)
# [1] 61184
length(unique(alldata$business$business_id))
# [1] 61184
length(alldata$business$business_id[medrows])
# [1] 1879
length(unique(alldata$business$business_id[medrows]))
# [1] 1879
length(alldata$business$business_id[altmedrows])
# [1] 370
length(unique(alldata$business$business_id[altmedrows]))
# [1] 370
# Here's the answer: 23 of the medical establishments don't appear in the reviews,
# so they must not have any reviews
length(setdiff(alldata$business$business_id[medrows], alldata$review$business_id))
# [1] 21
length(setdiff(alldata$business$business_id[altmedrows], alldata$review$business_id))
# [1] 2
#--------------------------
revs <- readRDS("revs.rds")
# re-arrange data frame of reviews
revs <- cbind(revs$business_id, revs[ , 2:3], revs$medicine, revs$stars,
revs$votes, revs$text, stringsAsFactors = F)
colnames(revs) <- gsub("revs\\$", "", colnames(revs))
revs$medicine <- as.factor(revs$medicine)
revs$business_id <- as.factor(revs$business_id)
revs <- revs[order(revs$review_id), ]
#--------------------------
# remove problematic reviews from data frame
library(qdap)
# according to 'Encoding' and 'check_text' functions, 76 reviews are in UTF-8,
# which can't be read by 'word_stats' function; 'iconv' couldn't convert UTF-8
# to ASCII, so the 76 UTF-8 rows are being deleted
utfrows <- which(Encoding(revs$text[1:dim(revs)[1]]) == "UTF-8")
revs <- revs[-utfrows, ]
# remove texts not written in English
# create data frame with misspelling rate for each review
misspellnumbs <- table(check_spelling(revs$text, assume.first.correct = FALSE)$row)
misspells <- cbind(as.integer(rownames(misspellnumbs)),
as.integer(misspellnumbs))
colnames(misspells) <- c("textrownumber", "misspellingsnumber")
# 'missingrows': not returned by check_spelling function; indicates no misspellings
missingrows <- setdiff(1:dim(revs)[1], misspells[ , 1])
nomisspells <- cbind(missingrows, rep(0, length(missingrows))) # reviews with no misspellings
misspells <- as.data.frame(rbind(misspells, nomisspells)) # all reviews combined
misspells <- misspells[order(misspells[ , 1]), ]
rownames(misspells) <- 1:dim(misspells)[1]
misspells$totalwords <- word_count(revs$text)
misspells$errorrate <- misspells[ , 2] / misspells [ , 3] # misspellings per word
# plot(misspells[ , 4]) # shows several rows with high misspelling rate
# identify reviews with misspelling rates > 50%
hierrorrownumbers <- misspells[misspells[ , 4] > 0.5, 1]
hierrorrownumbers <- hierrorrownumbers[!is.na(hierrorrownumbers)]
# revs$text[hierrorrownumbers]
foreigntexts <- c(1, 3) # visual inspection shows that these reviews are not in English
# revs$text[hierrorrownumbers[foreigntexts]]
# revs[hierrorrownumbers[foreigntexts], ]
# these are the row indices of the foreign-language reviews to delete from 'revs'
nonenglish <- hierrorrownumbers[foreigntexts]
revs <- revs[-nonenglish, ]
# row 4364: error in 'formality' function; it says it needs a 2-D array; haven't solved
problemrows <- c(4364)
revs <- revs[-problemrows, ]
#--------------------------
# prepare text of reviews for analysis
revs$text = add_incomplete(revs$text)
revs$text = incomplete_replace(revs$text)
saveRDS(revs, file = "revsculledprepped.rds")
#--------------------------
# add analytical metrics/scores of reviews to reviews data frame
revs <- readRDS("revsculledprepped.rds")
# This function detects whether a sequence of a vector's elements repeats.
# If they do, it eliminates the duplicates/repetitions so that only one
# sequence remains.
# The sequence can extend over any number of vector elements. There can be
# any number of duplicates in the vector; all the duplicates will be eliminated.
# The limitation of this function is that it checks for only 1 duplication.
# If it finds 1 duplicate and the overall length of the vector is a multiple
# of the length of the sequence, then it assumes any additional elements
# following the first duplicate are also duplicated sequences.
removerepeats <- function(vectora) {
# vectors must have at least 2 elements to continue through the function
if (length(vectora) < 2) {return(vectora)}
# checks whether the vector's 1st element is duplicated
firstrepeat <- match(vectora[1], vectora[-1], nomatch = 1)
# if not, function exits
if (firstrepeat == 1) {
return(vectora)
# if so, 'firstrepeat' is the vector index of that duplicate
# thus, 'firstrepeat - 1' is the length of the possible duplication sequence
} else if (firstrepeat > 1) {
firstrepeat <- firstrepeat + 1
}
# vector length must be a multiple of the sequence length to continue
# through function
lengthremainder <- length(vectora) %% (firstrepeat - 1)
if (lengthremainder != 0) {
return(vectora)
}
# to be a duplicate, each element of the 2nd sequence must match the
# corresponding element of the 1st sequence
repeattest = rep(FALSE, (firstrepeat - 1))
for (iter in 1:(firstrepeat - 1)) {
repeattest[iter] <- (vectora[iter] == vectora[firstrepeat + iter - 1])
}
# if any element between the 2 sequences mismatched, exit function
if (FALSE %in% repeattest) {
return(vectora)
# otherwise, the 2nd sequence is a duplicate, and the vector is modified
# so that only the 1st sequence is retained
} else {
vectora <- vectora[1:(firstrepeat - 1)]
return(vectora)
}
}
# 'n.imper' and 'n.incom' might be in wrong order in 'wordstatscolnames' below
# 'p.imper' and 'p.incom' might be in wrong order in 'wordstatscolnames' below
wordstatscolnames = c(
# from word_stats function
"all", "n.sent", "n.words", "n.char", "n.syl", "n.poly",
"wps", "cps", "sps", "psps", "cpw", "spw", "pspw",
"n.state", "n.quest", "n.exclm", "n.imper", "n.incom",
"p.state", "p.quest", "p.exclm", "p.imper", "p.incom",
"n.hapax", "n.dis", "grow.rate", "prop.dis",
# from 'pronoun_type' function
"pronoun.word.count", "I", "we", "you", "he", "she",
"they", "it", "me", "us", "him", "her", "them", "their",
# from 'polarity' function
"total.sentences", "total.words", "ave.polarity",
"sd.polarity", "stan.mean.polarity",
# from 'automated_readability_index' function
"read.word.count", "sentence.count", "character.count",
"Automated_Readability_Index",
# from 'diversity' function
"wc", "simpson", "shannon", "collision",
"berger_parker", "brillouin",
# from 'lexical_classification' function
"lexical.word.count", "ave.content.rate", "SE", "n.content",
"n.functional", # "content", "functional",
# from 'formality' function
"formality.word.count", "formality")
# number of columns following 'prop.dis' (last element of 'word_stats' function)
wstatsendcol = which(wordstatscolnames == "prop.dis")
afterwstatscolnum = length(wordstatscolnames) - wstatsendcol
temparray = rep(NA, length(wordstatscolnames))
temparray = as.data.frame(t(temparray))
colnames(temparray) = wordstatscolnames
# starttime = proc.time()
for (iter in 1:dim(revs)[1]) {
otherstats = rep(NA, afterwstatscolnum) # initialize 'otherstats', which contains results from functions other than 'wordstats'
otherstats = as.data.frame(t(otherstats))
if (grepl("[a-zA-Z]", revs$text[iter]) == FALSE) { # if there is no text (i.e., no alphabetic characters) in the review
wordstats = rep(NA, length(wordstatscolnames)) # create 'wordstats' with 'NA's for all values
wordstats = as.data.frame(t(wordstats))
colnames(wordstats) = wordstatscolnames
} else { # else create 'wordstats' with 'word_stats' function calculations
tempsplit = sentSplit(revs[iter, ], "text")
adjrownum = length(removerepeats(tempsplit$text))
tempsplit = tempsplit[1:adjrownum, ]
otherstats[1:14] = pronoun_type(tempsplit$text)$prop[ , -1]
otherstats[15:19] = polarity(tempsplit$text)$group[ , -1]
otherstats[20:23] = automated_readability_index(tempsplit$text)$Readability[ , -1]
otherstats[24:29] = diversity(tempsplit$text)[ , -1]
# 'lexical_classification' and 'formality' produce errors if there's
# only one word of text in the review, so they are skipped in this case
if (otherstats[1] > 1) {
otherstats[30:34] = lexical_classification(tempsplit$text)$lexical_classification[ , 2:6]
dummy <- capture.output(otherstats[35:36] <- formality(tempsplit$text)$formality[ , -1]) # hacky way of preventing 'formality' from printing output
} else {
otherstats[30:36] = NA
}
wordstats = word_stats(tempsplit$text)$gts
if (colnames(wordstats)[dim(wordstats)[2]] != wordstatscolnames[wstatsendcol]) {
wordstats[dim(wordstats)[2] + 1] = NA
colnames(wordstats)[dim(wordstats)[2]] = wordstatscolnames[wstatsendcol]
}
# if the last column name of 'wordstats' is not "prop.dis" (i.e., the last element in 'wordstatscolnames'),
# the 'if' condition in the 'for' loop below will produce an error; the 'if' statement above avoids this
# if a column in 'wordstats' is not present, insert column with 'NA' as value
for (iter2 in 1:wstatsendcol) {
if (colnames(wordstats)[iter2] != wordstatscolnames[iter2]) {
wordstats = data.frame(wordstats[1:(iter2 - 1)], NA, wordstats[iter2:dim(wordstats)[2]])
}
}
}
wordstats[28:63] = otherstats
colnames(wordstats) = wordstatscolnames
wordstats$all = as.character(wordstats$all) # convert 'factor' to 'character' so 'review_id' can be added (below)
wordstats$all = revs$review_id[iter]
temparray = rbind.data.frame(temparray, wordstats)
if (iter %% 100 == 0) {print(iter)}
}
# proc.time() - starttime
saveRDS(temparray, file = "temparray.rds")
temparray <- readRDS("temparray.rds")
temparray = temparray[-1, ]
rownames(temparray) = seq(from = 1, to = dim(temparray)[1])
# table(revs$review_id == temparray$all) # checks that rows match between 'revs' and 'temparrayall'
revsall = cbind(revs, temparray)
saveRDS(revsall, file = "revsall.rds")
#--------------------------
# add word frequencies to reviews data frame
library(tm)
revsall <- readRDS("revsall.rds")
# takes a vector of type 'character' (i.e., strings) and returns a vector of
# numbers of the words/tokens contained within original vector
corpfreqprep <- function(vectoroftexts, removepunct = TRUE) {
corpus <- Corpus(VectorSource(vectoroftexts))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus <- tm_map(corpus, content_transformer(stripWhitespace))
if (removepunct) corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpfreq <- colSums(as.matrix(DocumentTermMatrix(corpus)))
corpfreq <- sort(corpfreq, decreasing = TRUE)
return(corpfreq)
}
# frequencies of all words used across all reviews (not adjusted for length of reviews)
alltextsfreq <- corpfreqprep(revsall$text, removepunct = TRUE)
# remove words (mostly proper nouns) that might not generalize well to other data sets
# 331 vegas, 393 yelp, 543 phoenix, 561 las, 660 scottsdale, 753 groupon, 1047 arizona
# 1051 mayo, 1209 chandler, 1217 summerlin, 1331 john, 1333 nevada, 1385 miller
# 1426 gilbert, 1317 north, 1407 west
wordstoremove <- c(331, 393, 543, 561, 660, 753, 1047, 1051, 1209, 1217, 1317,
1331, 1333, 1385, 1407, 1426)
alltextsfreq <- alltextsfreq[-wordstoremove]
# selects words that appear at least 100 times across all reviews
# using 100 as cut-off is somewhat arbitrary
commonwordsnum <- match(99, alltextsfreq) - 1
commonwords <- names(alltextsfreq)[1:commonwordsnum]
# add columns to data frame for each common word's frequency in each review
revsallcolnum <- dim(revsall)[2]
revsall[(revsallcolnum + 1):(revsallcolnum + commonwordsnum)] <- NA
colnames(revsall)[(revsallcolnum + 1):(revsallcolnum + commonwordsnum)] <- commonwords
# for each row in 'revsall', i.e., for each review
for (iter in 1:dim(revsall)[1]) {
if (iter %% 100 == 0) {print(iter)}
tempfreq <- corpfreqprep(revsall$text[iter], removepunct = TRUE)
# if 'corpfreqprep' returns 'logical(0)' to 'tempfreq', assigning "" to 'tempfreq'
# avoid subsequent error
# this error occurs 8 times in the data set at these row indices in 'revsall':
# 1711, 1954, 6760, 8069, 8417, 8486, 9783, 10902
# the review texts at these row indices are: "Ok|", "|", ".", "|", "c|", "A|", "NT|", ":)|"
# these errors do not affect the totals for the most common words, so the
# correction below is adequate
if (length(tempfreq) == 0) {
tempfreq <- 0
names(tempfreq) <- ""
}
# for each word counted in a review
for (iter2 in 1:length(tempfreq)) {
# if the word counted in the review is in the common words
if (names(tempfreq)[iter2] %in% commonwords) {
wordcoltemp <- which(colnames(revsall)[(revsallcolnum + 1):dim(revsall)[2]] ==
names(tempfreq)[iter2])
wordcol <- wordcoltemp + revsallcolnum
# save the frequency of the word as a proportion of the number of words in the review
revsall[iter, wordcol] <- tempfreq[iter2] / revsall$n.words[iter]
}
}
}
saveRDS(revsall, file = "revsallwordfreqs.rds")
#--------------------------
# modify variables and remove unneeded variables
expdata <- readRDS("revsallwordfreqs.rds")
# checks word frequencies in 'expdata' against word frequencies in 'alltextsfreq'
# to make sure data were added to 'revsall'/'expdata' correctly
# startcol <- 73
# stopcol <- dim(expdata)[2]
# coltotals <- rep(NA, length(startcol:stopcol))
# nwordscolnum <- 12
# for (colnum in startcol:stopcol){
# rowwordtotal <- 0
# for (iter in 1:dim(expdata)[1]) {
# if (!is.na(expdata[iter, colnum])) {
# rowwordnum <- expdata[iter, colnum] * expdata[iter, nwordscolnum]
# rowwordtotal <- rowwordtotal + rowwordnum
# }
# }
# coltotals[colnum - (startcol - 1)] <- rowwordtotal
# }
# table(coltotals == alltextsfreq[1:commonwordsnum]) # all 1462 should be TRUE; should be no FALSEs
# 'ave.polarity' is the total polarity divided by the number of sentences, but
# the sentence number counts are unreliable. Therefore, 'ave.polarity' will be
# transformed to be polarity per word (instead of "polarity per sentence").
# plot(abs(revsall$n.words - revsall$total.words)/mean(c(revsall$n.words, revsall$total.words), na.rm=T))
expdata$ave.polarity = (expdata$ave.polarity * expdata$total.sentences) / expdata$total.words
# replace 'NA's with zeroes
expdata[is.na(expdata)] <- 0
# word counts for different functions don't completely agree
# columns of word counts: 12, 37, 52, 56, 60, 66
# cor(expdata[,c(12, 37, 52, 56, 60, 66)], use = "pairwise.complete.obs")
# correlation matrix and graphs show the agreement is very strong -- strong enough to delete extra columns
# table(revsall$n.words == revsall$total.words)
# delete: 37, 52, 56, 60, 66: columns of word counts
# delete 9, review text
# delete column 10, duplicate of 'review_id'
# delete 11 'n.sent': number of sentences is often incorrect
# delete 16:19: all 'per sentence' stats, which are often incorrect
# delete 28:32: proportions of all sentences, which are often incorrect
# delete 33:36: unsure how unique words would be useful; deleting
# delete 54:55: polarity standard deviation is by sentence count, which is often incorrect
# delete 56:59: readability relies on sentence counts, which are often incorrect; can I calculate something useful from this?
# delete 63, 65: shannon, collision, brillouin correlate very highly; keep shannon
# delete 66, 68:70: 'ave.content.rate' (col 67) is good (expressed as %); other cols unnecessary
# delete 71: 'formality' (col 72) is good (expressed as %); 71 is unnecessary
cullcols = c(9:11, 16:19, 28:32, 33:37, 51:52, 54:60, 63, 65:66, 68:71)
expdata = expdata[ , -cullcols]
saveRDS(expdata, file = "revsfeaturesculled.rds")
#--------------------------
# barplot of ratings by 'alternative' or 'conventional' medicine
expdata <- readRDS("revsfeaturesculled.rds")
png('ratings.png', width=640, height=512)
plotdata = expdata
starsbymed = cbind(as.data.frame(table(plotdata[plotdata$medicine == "conventional", ]$stars)),
as.data.frame(table(plotdata[plotdata$medicine == "alternative", ]$stars)))
starsbymed = starsbymed[ , -c(1, 3)]
starsmedprop = starsbymed
starsmedprop[ , 1] = starsmedprop[ , 1] / sum(starsmedprop[ , 1])
starsmedprop[ , 2] = starsmedprop[ , 2] / sum(starsmedprop[ , 2])
colnames(starsmedprop) = c("conventional", "alternative")
barplot(t(as.matrix(starsmedprop)), beside = T, ylim = c(0, 1), col = c(8, 9),
xlab = c("Number of stars"), ylab = c("Proportion of reviews"),
names.arg = c("1", "2", "3", "4", "5"),
main = "Yelp reviewer ratings of medical establishments")
legend(x = 1, y = 0.98, legend = colnames(starsmedprop), fill = c(8, 9))
abline(h = 0)
dev.off()
#--------------------------
# splitting data into training (60% of data) and testing (40%) sets, stratified
# by 'medicine' ('conventional' or 'alternative') and star ratings (1 - 5)
expdata <- readRDS("revsfeaturesculled.rds")
var1 <- levels(expdata$medicine)
var2 <- sort(unique(expdata$stars))
testrows <- NA
set.seed(43317)
for (iter1 in 1:length(var1)) {
for (iter2 in 1:length(var2)) {
set <- which(expdata$medicine == var1[iter1] & expdata$stars == var2[iter2])
sampledrows <- sample(set, size = round(0.4 * length(set)), replace = FALSE)
testrows <- append(testrows, sampledrows)
}
}
testrows <- testrows[-1]
traindata <- expdata[-testrows, ]
testdata <- expdata[testrows, ]
saveRDS(traindata, file = 'traindata.rds')
saveRDS(testdata, file = 'testdata.rds')
#--------------------------
# tree model on training set
traindata <- readRDS('traindata.rds')
testdata <- readRDS('testdata.rds')
# column indices for data to be included in analysis
# column 4 is the classification label: conventional or alternative medicine
# columns 5 - 39 are Yelp data and the standard analyses from the package 'qdap'
# the remaining columns are word usage rates
response_col <- 4
predictor_cols <- 5:dim(traindata)[2]
library(randomForest)
# when sampsize = # cases in class (n), and as 'n' grows larger, the number of
# unique samples in 'sampsize' converges toward 1 - 1/e (~63.2%)
sampsize <- min(table(traindata[ , response_col]))
n_trees <- 5000
print_interval <- ceiling(n_trees / 100)
# parameter search for 'mtry'
set.seed(22640171)
mtry_results <- tuneRF(y=traindata[ , response_col],
x=traindata[ , predictor_cols],
sampsize=c(sampsize, sampsize),
strata=traindata[ , response_col],
ntreeTry=n_trees,
doBest=F, importance=T, plot=F, do.trace=print_interval)
write.csv(mtry_results, 'mtry_oob_error.csv')
mtry_results <- read.csv('mtry_oob_error.csv', header=T, row.names=1)
mtry <- mtry_results[mtry_results[ , 2] == min(mtry_results[ , 2]), 1]
# train on all training data
set.seed(83875501)
rfmodel <- randomForest(y=traindata[ , response_col],
x=traindata[ , predictor_cols],
sampsize=c(sampsize, sampsize),
strata=traindata[ , response_col],
ntree=n_trees, mtry=mtry,
keep.inbag=T, keep.forest=T,
importance=T, do.trace=print_interval)
saveRDS(rfmodel, file='random_forest_model.rds')
#--------------------------
# evaluate model
library(randomForest)
library(caret)
library(ROCR)
response_col <- 4
traindata <- readRDS('traindata.rds')
testdata <- readRDS('testdata.rds')
rfmodel <- readRDS('random_forest_model.rds')
get_confusion_elements <- function(caret_confusion_matrix) {
tp <- as.numeric(caret_confusion_matrix$table[1]) # true positives
fn <- as.numeric(caret_confusion_matrix$table[2]) # false negatives
fp <- as.numeric(caret_confusion_matrix$table[3]) # false positives
tn <- as.numeric(caret_confusion_matrix$table[4]) # true negatives
return( c(tp, fp, tn, fn) )
}
calculate_mcc <- function(tp, fp, tn, fn) {
# calculates Matthews correlation coefficient
# tp - true positives
# fp - false positives
# tn - true negatives
# fn - false negatives
mcc <- ((tp * tn) - (fp * fn)) /
sqrt( (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) )
return(mcc)
}
get_mcc <- function(caret_confusion_matrix) {
# calculates Matthews correlation coefficient from confusion matrix
ce <- get_confusion_elements(caret_confusion_matrix)
mcc <-calculate_mcc(ce[1], ce[2], ce[3], ce[4])
return(mcc)
}
calculate_auc_one_classifier<- function(recall_sensitivity_tpr,
specificity_tnr) {
a <- recall_sensitivity_tpr
b <- 1 - specificity_tnr
# a*b/2 + a*(1-b) + (1-a)*(1-b)/2 reduces to:
auc <- a/2 - b/2 + 1/2
return(auc)
}
calculate_f1_score <- function(precision, recall, beta=1) {
# calculates F1 score from precision, recall, and beta
f1_score <- unname( ( (1 + beta^2) * recall * precision ) /
( (recall + precision) * beta^2) )
return(f1_score)
}
get_f1_score <- function(caret_confusion_matrix) {
# calculates F1 score from confusion matrix
sensitivity_recall <- caret_confusion_matrix$byClass[1]
ppv_precision <- caret_confusion_matrix$byClass[3]
f1_score <- calculate_f1_score(ppv_precision, sensitivity_recall)
return(f1_score)
}
calculate_informedness <- function(sensitivity_recall_tpr, specificity_tnr) {
# for dichotomous outcomes, informedness is also known as Youden's J and
# as Delta P'
informedness <- sensitivity_recall_tpr + specificity_tnr - 1
return(informedness)
}
get_informedness <- function(caret_confusion_matrix) {
# calculates informedness from confusion matrix
sensitivity_recall_tpr <- caret_confusion_matrix$byClass[1]
specificity_tnr <- caret_confusion_matrix$byClass[2]
inform <- unname(calculate_informedness(sensitivity_recall_tpr,
specificity_tnr))
return(inform)
}
calculate_markedness <- function(ppv_precision, npv) {
# for dichotomous outcomes, markedness is also known as Delta P
markedness <- ppv_precision + npv - 1
return(markedness)
}
get_markedness <- function(caret_confusion_matrix) {
# calculates markedness from confusion matrix
ppv_precision <- caret_confusion_matrix$byClass[3]
npv <- caret_confusion_matrix$byClass[4]
markedness <- unname(calculate_markedness(ppv_precision, npv))
return(markedness)
}
get_metrics <- function(confusion_matrix) {
inform <- get_informedness(confusion_matrix)
mark <- get_markedness(confusion_matrix)
mcc <- get_mcc(confusion_matrix)
f1 <- get_f1_score(confusion_matrix)
return( c(inform, mark, mcc, f1) )
}
# confusionMatrix(rfmodel$predicted, traindata[ , response_col])
# random forest OOB classification
rf_oob_classify <- rfmodel$predicted
# random forest classification
rf_classify <- predict(rfmodel, testdata)
class_table <- table(traindata[ , response_col])
class_table_prop <- class_table / sum(class_table)
# random classification, 50%-50%
set.seed(12247081)
random_classify_5050 <- sample(names(class_table), dim(testdata)[1], replace=T)
# random classification, imbalanced (skewed)
set.seed(32571012)
random_classify_skew <- sample(names(class_table), dim(testdata)[1], replace=T,
prob=class_table_prop)
# uniform majority classification
max_class_n <- max(class_table)
max_class_name <- names(which(class_table == max_class_n))
uniform_classify <- rep(max_class_name, dim(testdata)[1])
# uniform majority classification, except for one case
min_class_n <- min(class_table)
min_class_name <- names(which(class_table == min_class_n))
almost_uniform_classify <- uniform_classify
almost_uniform_classify[1] <- min_class_name
classify_list <- list(rf_oob_classify, rf_classify, random_classify_5050,
random_classify_skew, uniform_classify,
almost_uniform_classify)
confusion_matrices <- vector('list', length(classify_list))
col_names <- c('sensitivity', 'specificity', 'inform', 'mark', 'mcc', 'f1',
'auc')
metrics <- data.frame(matrix(nrow=length(classify_list),
ncol=length(col_names)))
rownames(metrics) <- c('rf_oob', 'rf', 'random_50-50', 'random_skew',
'uniform', 'almost_uniform')
colnames(metrics) <- col_names
for (i in 1:length(classify_list)) {
if (i == 1) {
confusion_matrices[[i]] <- confusionMatrix(classify_list[[i]],
traindata[ , response_col])
} else {
confusion_matrices[[i]] <- confusionMatrix(classify_list[[i]],
testdata[ , response_col])
}
metrics[i, 1] <- confusion_matrices[[i]]$byClass[1]
metrics[i, 2] <- confusion_matrices[[i]]$byClass[2]
metrics[i, 3:(dim(metrics)[2]-1)] <- get_metrics(confusion_matrices[[i]])
metrics[i, dim(metrics)[2]] <- calculate_auc_one_classifier(metrics[i, 1],
metrics[i, 2])
}
rf_classify_prob <- predict(rfmodel, testdata, type='prob')
rf_test_pred <- prediction(rf_classify_prob[ , 1], testdata[ , response_col],
c(max_class_name, min_class_name))
rf_test_roc <- performance(rf_test_pred, 'tpr', 'fpr')
png('rf_roc_auc.png', width=640, height=640)
plot(rf_test_roc, col=2)
points(y=metrics$sensitivity[-1], x=1-metrics$specificity[-1], pch=16)
points(y=metrics[2, 1], x=1-metrics[2, 2], pch=16, col=2)
abline(0, 1, lty=2)
lines(y=c(0, metrics$sensitivity[2]), x=c(0, 1-metrics$specificity[2]), col=2)
lines(y=c(1, metrics$sensitivity[2]), x=c(1, 1-metrics$specificity[2]), col=2)
dev.off()
# add OOB to plot
# rf_oob_pred <- prediction(rfmodel$votes[ , 1], rfmodel$y,
# c(max_class_name, min_class_name))
# rf_oob_roc <- performance(rf_oob_pred, 'tpr', 'fpr')
# points(y=metrics[1, 1], x=1-metrics[1, 2], pch=16, col=3)
# plot(rf_oob_roc, add=T, col=3)
# AUC
performance(rf_test_pred, 'auc')@y.values[[1]]
# verifying on plot that random forest model defaulted to 0.50 voting threshold
# a = cbind(rf_test_roc@x.values[[1]],
# rf_test_roc@y.values[[1]],
# rf_test_roc@alpha.values[[1]])
# plot(rf_test_roc, col=2, ylim=c(0.6, 1), xlim=c(0, 0.15))
# points(y=metrics[2, 1], x=1-metrics[2, 2], pch=16, col=2)
# points(y=a[934, 2], x=a[934, 1])
# dots overlap, so yes, the voting threshold was 0.50
get_y_intercept <- function(x, y, slope) {
return(y - (slope * x))
}
slope <- max_class_n / min_class_n
intercept <- get_y_intercept(1 - metrics[2, 2], metrics[2, 1], slope)
png('rf_roc_isocost.png', width=640, height=512)
plot(rf_test_roc, col=2)
points(y=metrics$sensitivity[-1], x=1-metrics$specificity[-1], pch=16)
points(y=metrics[2, 1], x=1-metrics[2, 2], pch=16, col=2)
abline(0, 1, lty=2)
abline(intercept, slope)
dev.off()
rf_test_mcc <- performance(rf_test_pred, 'mat')
mcc_max <- max(rf_test_mcc@y.values[[1]], na.rm=T)
mcc_max_idx <- which(rf_test_mcc@y.values[[1]] == mcc_max)
mcc_max_threshold <- rf_test_mcc@x.values[[1]][mcc_max_idx]
png('rf_mcc_threshold.png', width=640, height=512)
plot(rf_test_mcc)
abline(v=mcc_max_threshold, lty=2)
dev.off()
thresholds <- rf_test_roc@alpha.values[[1]]
# min(abs(thresholds - mcc_max_threshold))
# plot( abs(thresholds - mcc_max_threshold) ~ c(1:2812), xlim=c(810, 820), ylim=c(0, 0.05))
thresholds_idx <- which(thresholds == mcc_max_threshold)
roc_y <- rf_test_roc@y.values[[1]][thresholds_idx]
roc_x <- rf_test_roc@x.values[[1]][thresholds_idx]
png('rf_roc_max_mcc.png', width=640, height=512)
plot(rf_test_roc, col=2, ylim=c(0.6, 1), xlim=c(0, 0.10))
points(y=metrics[2, 1], x=1-metrics[2, 2], pch=16, col=2, cex=2)
abline(intercept, slope)
points(y=roc_y, x=roc_x, pch=16, col=3, cex=2)
dev.off()
rf_predict_optimal_threshold <- as.factor(rf_classify_prob[ , 1] < mcc_max_threshold)
levels(rf_predict_optimal_threshold) <- c(min_class_name, max_class_name)
rf_confusion_optimal_threshold <- confusionMatrix(rf_predict_optimal_threshold,
testdata[ , response_col])
rf_optimized_metrics <- metrics[2:3, 1:dim(metrics)[2]-1]
rownames(rf_optimized_metrics)[2] <- 'rf_optimized'
rf_optimized_metrics[2, 1] <- rf_confusion_optimal_threshold$byClass[1]
rf_optimized_metrics[2, 2] <- rf_confusion_optimal_threshold$byClass[2]
rf_optimized_metrics[2, 3:6] <- get_metrics(rf_confusion_optimal_threshold)
# plots informedness on ROC plot
# points(y=rf_optimized_metrics$inform[1], x=1-rf_optimized_metrics$specificity[1], col=2)
# points(y=rf_optimized_metrics$inform[2], x=1-rf_optimized_metrics$specificity[2], col=3)
rf_test_prc <- performance(rf_test_pred, 'prec', 'rec')
png('rf_prc_max_mcc.png', width=640, height=512)
plot(rf_test_prc, col=2, ylim=c(0, 1))
# plot(rf_test_prc, col=2, xlim=c(0.75, 0.9), ylim=c(0.75, 0.9))
min_prevalence <- min(class_table_prop)
abline(h=min_prevalence, lty=2)
points(y=confusion_matrices[[2]]$byClass[3],
x=confusion_matrices[[2]]$byClass[1],
col=2, pch=16)
points(y=rf_confusion_optimal_threshold$byClass[3],
x=rf_confusion_optimal_threshold$byClass[1],
col=3, pch=16)
points(y=rf_optimized_metrics$mark[1], rf_optimized_metrics$inform[1], col=2)
points(y=rf_optimized_metrics$mark[2], rf_optimized_metrics$inform[2], col=3)
dev.off()
sink(file = "rf_output.txt")
rfmodel
rf_confusion_optimal_threshold
metrics
rf_optimized_metrics
sink(file = NULL)
png('rf_variable_importance.png', width=640, height=640)
varImpPlot(rfmodel, n.var=30, main='Variable Importance', pch=16, cex=1)
dev.off()
png('rf_oob_error_by_tree.png', width=640, height=512)
plot(rfmodel)
dev.off()
var_imp <- importance(rfmodel)
var_imp <- var_imp[order(var_imp[ , 4], decreasing=T), ]
library(dplyr)
expdata <- readRDS('revsfeaturesculled.rds')
traindata <- readRDS('traindata.rds')
con_alt_means <- expdata[ , -c(1:3)] %>%
group_by(medicine) %>%
summarise_each(funs(mean))
con_alt_means <- as.data.frame(t(con_alt_means[ , -1]))
colnames(con_alt_means) <- c('alternative', 'conventional')
con_alt_means$alt_con_ratio <- (con_alt_means$alternative /
con_alt_means$conventional)
con_alt_means$con_alt_ratio <- (con_alt_means$conventional /
con_alt_means$alternative)
write.table(con_alt_means, 'con_alt_means.txt', sep='\t\t\t', row.names=T)
con_alt_means_train <- traindata[ , -c(1:3)] %>%
group_by(medicine) %>%
summarise_each(funs(mean))
con_alt_means_train <- as.data.frame(t(con_alt_means_train[ , -1]))
colnames(con_alt_means_train) <- c('alternative', 'conventional')
con_alt_means_train$alt_con_ratio <- (con_alt_means_train$alternative /
con_alt_means_train$conventional)
con_alt_means_train$con_alt_ratio <- (con_alt_means_train$conventional /
con_alt_means_train$alternative)
write.table(con_alt_means_train, 'con_alt_means_train.txt', sep='\t\t\t',
row.names=T)
// add bootstrap table styles to pandoc tables function bootstrapStylePandocTables() { $('tr.header').parent('thead').parent('table').addClass('table table-condensed'); } $(document).ready(function () { bootstrapStylePandocTables(); });