# Classifying Medicine, Full Code

#--------------------------
library(jsonlite)

# download data from:  http://www.yelp.com/dataset_challenge
# data assumed to be in directory 'yelp_dataset_challenge_academic_dataset',
# which is inside the current working directory

dirname <- "yelp_dataset_challenge_academic_dataset"
filestem <- "yelp_academic_dataset_"
filenamepart <- c("business", "checkin", "review", "tip", "user")
fileext <- ".json"
alldata <- list()

for (iter in 1:length(filenamepart)) {
     filename = paste(dirname, "/", filestem, filenamepart[iter], fileext, sep = "")
     dataframe = filenamepart[iter]
     alldata[[iter]] <- fromJSON(sprintf("[%s]", paste(readLines(filename), collapse = ",")))
     print(filename)
     print(dataframe)
}
     
names(alldata) <- filenamepart

saveRDS(alldata, file = "revs_json_read.rds")
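
# aside: jsonlite can also stream newline-delimited JSON directly; a minimal
#    sketch of an equivalent read for one file (results should match the
#    readLines/sprintf approach above, up to jsonlite's simplification defaults):
# business <- stream_in(file(paste0(dirname, "/", filestem, "business", fileext)))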

#--------------------------
alldata <- readRDS("revs_json_read.rds")

# identify rows of entries pertaining to doctors or hospitals, i.e., "mainstream"
# medicine.  Including "Health" here does not change the number of physicians or
# hospitals included; it does increase the counts of "Counseling & Mental Health",
# "Diagnostic Imaging", "Diagnostic Services", and "Optometrists".  It's debatable
# whether those should be included.
medrows = unique(c(grep("Doctor", alldata$business$categories), 
                   grep("Hospital", alldata$business$categories),
                   grep("Allergist", alldata$business$categories),
                   grep("Anesthesiologist", alldata$business$categories),
                   grep("Cardiologist", alldata$business$categories),
                   grep("Surgeon", alldata$business$categories),
                   grep("Dentist", alldata$business$categories),
                   grep("Drugstore", alldata$business$categories),
                   grep("Ear Nose & Throat", alldata$business$categories),
                   grep("Endodontist", alldata$business$categories),
                   grep("Internal Medicine", alldata$business$categories),
                   grep("Laser Eye Surgery/Lasik", alldata$business$categories),
                   grep("Obstetrician", alldata$business$categories),
                   grep("Gastroenterologist", alldata$business$categories),
                   grep("Gynecologist", alldata$business$categories),
                   grep("Ophthalmologist", alldata$business$categories),
                   grep("Oncologist", alldata$business$categories),
                   grep("Orthodontist", alldata$business$categories),
                   grep("Orthopedist", alldata$business$categories),
                   grep("Orthotic", alldata$business$categories),
                   grep("Pediatric", alldata$business$categories),
                   grep("Periodontist", alldata$business$categories),
                   grep("Pharmacy", alldata$business$categories),
                   grep("Podiatrist", alldata$business$categories),
                   grep("Psychiatrist", alldata$business$categories),
                   grep("Pulmonologist", alldata$business$categories),
                   grep("Radiologist", alldata$business$categories),
                   grep("Rheumatologist", alldata$business$categories),
                   grep("Urologist", alldata$business$categories),
                   grep("Medical Center", alldata$business$categories)))
# length(medrows)
# [1] 2632
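
# aside: the repeated 'grep' calls above can be collapsed into one regular
#    expression; a minimal sketch (the 'medpatterns' vector is hypothetical and
#    would need to hold all the category words used above):
# medpatterns <- c("Doctor", "Hospital", "Allergist")   # ..., "Medical Center"
# medrows2 <- grep(paste(medpatterns, collapse = "|"), alldata$business$categories)
# setequal(medrows, medrows2)   # TRUE, given the full pattern list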

# remove rows associated with "alternative" medicine from 'medrows'
rmaltmedrows = unique(c(grep("Acupuncture", alldata$business$categories[medrows]),
                        grep("Massage", alldata$business$categories[medrows]),
                        grep("Naturopath", alldata$business$categories[medrows]),
                        grep("Psychic", alldata$business$categories[medrows]),
                        grep("Yoga", alldata$business$categories[medrows]),
                        grep("Spas", alldata$business$categories[medrows]),
                        grep("Food", alldata$business$categories[medrows]),
                        grep("Fitness", alldata$business$categories[medrows]),
                        grep("Osteopath", alldata$business$categories[medrows]),
                        grep("Chinese Medicine", alldata$business$categories[medrows]),
                        grep("Shopping", alldata$business$categories[medrows]),
                        grep("Chiropractor", alldata$business$categories[medrows]),
                        grep("Cannabis", alldata$business$categories[medrows]),
                        grep("Reflexology", alldata$business$categories[medrows]),
                        grep("Rolfing", alldata$business$categories[medrows]),
                        grep("Coach", alldata$business$categories[medrows]),
                        grep("Reiki", alldata$business$categories[medrows])))
medrows = medrows[-rmaltmedrows]
# length(rmaltmedrows)
# [1] 753
# length(medrows)
# [1] 1879

#--------------------------
# identify rows of entries clearly pertaining to "alternative" medicine
# includes only words that are clearly alternative medicine and excludes words that
# might be compatible with both mainstream and alternative medicine (e.g., "yoga" 
# could be considered exercise and thus compatible with mainstream medicine).
altmedrows = unique(c(grep("Acupuncture", alldata$business$categories), 
                      grep("Chiropractor", alldata$business$categories),
                      grep("Chinese Medicine", alldata$business$categories),
                      grep("Reflexology", alldata$business$categories),
                      grep("Reiki", alldata$business$categories),
                      grep("Osteopath", alldata$business$categories),
                      grep("Rolfing", alldata$business$categories),
                      grep("Naturopathic", alldata$business$categories)))
# length(altmedrows)
# [1] 378


# remove rows associated with "mainstream" medicine from 'altmedrows'
rmmedrows = unique(c(grep("Dermatologists", alldata$business$categories[altmedrows]),
                     grep("Neurologist", alldata$business$categories[altmedrows]),
                     grep("Obstetrician", alldata$business$categories[altmedrows]),
                     grep("Gynecologist", alldata$business$categories[altmedrows]),
                     grep("Orthopedist", alldata$business$categories[altmedrows]),
                     grep("Allergist", alldata$business$categories[altmedrows]),
                     grep("Internal Medicine", alldata$business$categories[altmedrows])))
altmedrows = altmedrows[-rmmedrows]
# length(rmmedrows)
# [1] 8
# length(altmedrows)
# [1] 370

# verify that no businesses were classified as both conventional and alternative medicine
intersect(medrows, altmedrows)
intersect(alldata$business$business_id[medrows], alldata$business$business_id[altmedrows])
# integer(0)
# character(0)

#--------------------------
# this section assesses what business categories are included in each group

# number of clearly mainstream medicine businesses (after eliminating businesses associated with alternative medicine words)
length(medrows)
# [1] 1879
# number of reviews of clearly mainstream medicine businesses
sum(alldata$business$review_count[medrows])
# [1] 14250
# business categories associated with clearly mainstream medicine businesses
table(unlist(alldata$business$categories[medrows]))
# mednames = as.data.frame(table(unlist(alldata$business$categories[medrows])))[ , 1]

# number of clearly alternative medicine businesses (after eliminating businesses associated with mainstream medicine words)
length(altmedrows)
# [1] 370
# number of reviews of clearly alternative medicine businesses
sum(alldata$business$review_count[altmedrows])
# [1] 3136
# business categories associated with clearly alternative medicine businesses
table(unlist(alldata$business$categories[altmedrows]))


#--------------------------
# get row indices for reviews of mainstream medicine businesses
medrevrows = NA
for (iter in 1:length(medrows)) {
     medrevrows = c(medrevrows, which(alldata$review$business_id ==
                                      alldata$business$business_id[medrows[iter]]))
}
medrevrows = medrevrows[-1]

# get row indices for reviews of alternative medicine businesses
altmedrevrows = NA
for (iter in 1:length(altmedrows)) {
     altmedrevrows = c(altmedrevrows, which(alldata$review$business_id ==
                                            alldata$business$business_id[altmedrows[iter]]))
}
altmedrevrows = altmedrevrows[-1]
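
# aside: a vectorized equivalent of the two loops above; '%in%' selects the
#    same review rows, though ordered by position in 'alldata$review' rather
#    than by business (the reviews are re-sorted by 'review_id' later anyway):
# medrevrows2 <- which(alldata$review$business_id %in%
#                      alldata$business$business_id[medrows])
# altmedrevrows2 <- which(alldata$review$business_id %in%
#                         alldata$business$business_id[altmedrows])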

length(medrevrows)
# [1] 12957
length(altmedrevrows)
# [1] 2883
# matching business IDs to get reviews (here) and summing business review
# counts (above) don't give the same number of reviews:  mainstream medicine,
# 14250 above vs. 12957 here; alternative medicine, 3136 above vs. 2883 here
# maybe Yelp's review counts are inaccurate somehow?

# create data frame of only mainstream and alternative medicine reviews
medrevs = alldata$review[medrevrows, c(1:4, 6, 8)]
altmedrevs = alldata$review[altmedrevrows, c(1:4, 6, 8)]

medrevs[ , "medicine"] = "conventional"
altmedrevs[ , "medicine"] = "alternative"

revs = rbind(medrevs, altmedrevs)
saveRDS(revs, file = "revs.rds")


# number of establishments in 'revs' doesn't match number from 'alldata$business'
length(medrows) + length(altmedrows)
# [1] 2249
length(unique(revs$business_id))
# [1] 2226
(length(medrows) + length(altmedrows)) - length(unique(revs$business_id))
# [1] 23
# maybe those 23 businesses didn't have any reviews?

# first, verify business IDs are unique
length(alldata$business$business_id)
# [1] 61184
length(unique(alldata$business$business_id))
# [1] 61184
length(alldata$business$business_id[medrows])
# [1] 1879
length(unique(alldata$business$business_id[medrows]))
# [1] 1879
length(alldata$business$business_id[altmedrows])
# [1] 370
length(unique(alldata$business$business_id[altmedrows]))
# [1] 370

# Here's the answer:  23 of the medical establishments don't appear in the reviews,
#    so they must not have any reviews
length(setdiff(alldata$business$business_id[medrows], alldata$review$business_id))
# [1] 21
length(setdiff(alldata$business$business_id[altmedrows], alldata$review$business_id))
# [1] 2


#--------------------------
revs <- readRDS("revs.rds")
# re-arrange data frame of reviews
revs <- cbind(revs$business_id, revs[ , 2:3], revs$medicine, revs$stars,
              revs$votes, revs$text, stringsAsFactors = F)
colnames(revs) <- gsub("revs\\$", "", colnames(revs))
revs$medicine <- as.factor(revs$medicine)
revs$business_id <- as.factor(revs$business_id)
revs <- revs[order(revs$review_id), ]


#--------------------------
# remove problematic reviews from data frame
library(qdap)

# according to the 'Encoding' and 'check_text' functions, 76 reviews are in
# UTF-8, which the 'word_stats' function can't read; 'iconv' couldn't convert
# the UTF-8 to ASCII, so the 76 UTF-8 rows are deleted
utfrows <- which(Encoding(revs$text) == "UTF-8")
revs <- revs[-utfrows, ]
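# note: 'revs[-utfrows, ]' would delete every row if 'utfrows' were empty
#    (negative indexing with a zero-length vector); on another data set a
#    guard would be safer:
# if (length(utfrows) > 0) revs <- revs[-utfrows, ]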

# remove texts not written in English
# create data frame with misspelling rate for each review
misspellnumbs <- table(check_spelling(revs$text, assume.first.correct = FALSE)$row)
misspells <- cbind(as.integer(rownames(misspellnumbs)),
                   as.integer(misspellnumbs))
colnames(misspells) <- c("textrownumber", "misspellingsnumber")
# 'missingrows': row numbers not returned by the 'check_spelling' function, i.e., reviews with no misspellings
missingrows <- setdiff(1:dim(revs)[1], misspells[ , 1])
nomisspells <- cbind(missingrows, rep(0, length(missingrows)))   # reviews with no misspellings
misspells <- as.data.frame(rbind(misspells, nomisspells))        # all reviews combined  
misspells <- misspells[order(misspells[ , 1]), ]  
rownames(misspells) <- 1:dim(misspells)[1]
misspells$totalwords <- word_count(revs$text)
misspells$errorrate <- misspells[ , 2] / misspells[ , 3]         # misspellings per word
# plot(misspells[ , 4])    # shows several rows with high misspelling rate
# identify reviews with misspelling rates > 50%
hierrorrownumbers <- misspells[misspells[ , 4] > 0.5, 1]
hierrorrownumbers <- hierrorrownumbers[!is.na(hierrorrownumbers)]
# revs$text[hierrorrownumbers]
foreigntexts <- c(1, 3)  # visual inspection shows that these reviews are not in English
# revs$text[hierrorrownumbers[foreigntexts]]
# revs[hierrorrownumbers[foreigntexts], ]
# these are the row indices of the foreign-language reviews to delete from 'revs'
nonenglish <- hierrorrownumbers[foreigntexts]
revs <- revs[-nonenglish, ]

# row 4364:  the 'formality' function errors on this review, saying it needs a
#    2-D array; the cause is unresolved, so the row is removed
problemrows <- c(4364)
revs <- revs[-problemrows, ]


#--------------------------
# prepare text of reviews for analysis

revs$text = add_incomplete(revs$text)
revs$text = incomplete_replace(revs$text)


saveRDS(revs, file = "revsculledprepped.rds")

#--------------------------
# add analytical metrics/scores of reviews to reviews data frame

revs <- readRDS("revsculledprepped.rds")

# This function detects whether a sequence of a vector's elements repeats.
# If it does, the function eliminates the duplicates/repetitions so that
# only one copy of the sequence remains.

# The sequence can extend over any number of vector elements.  There can be
# any number of duplicates in the vector; all the duplicates will be eliminated.

# The limitation of this function is that it verifies only the 1st duplication.
# If it finds 1 duplicate and the overall length of the vector is a multiple
# of the length of the sequence, it assumes any additional elements
# following the first duplicate are also duplicated sequences.

removerepeats <- function(vectora) {
     # vectors must have at least 2 elements to continue through the function
     if (length(vectora) < 2) {return(vectora)}
     
     # find the first duplicate of the vector's 1st element; 'nomatch = 0'
     # keeps "no duplicate" distinct from a duplicate at the 1st searched
     # position
     firstrepeat <- match(vectora[1], vectora[-1], nomatch = 0)
     # if the 1st element never recurs, there is no repeated sequence; exit
     if (firstrepeat == 0) {
          return(vectora)
     }
     # 'match' searched 'vectora[-1]', so shift the index by 1 to refer to
     # 'vectora'; 'firstrepeat' is now the index of the duplicate, and thus
     # 'firstrepeat - 1' is the length of the possible duplicated sequence
     firstrepeat <- firstrepeat + 1
     
     # vector length must be a multiple of the sequence length to continue
     # through function
     lengthremainder <- length(vectora) %% (firstrepeat - 1)
     if (lengthremainder != 0) {
          return(vectora)
     } 
     
     # to be a duplicate, each element of the 2nd sequence must match the 
     # corresponding element of the 1st sequence
     repeattest = rep(FALSE, (firstrepeat - 1))
     for (iter in 1:(firstrepeat - 1)) {
          repeattest[iter] <- (vectora[iter] == vectora[firstrepeat + iter - 1])
     }
     
     # if any element between the 2 sequences mismatched, exit the function;
     # otherwise, the 2nd sequence is a duplicate, and the vector is truncated
     # so that only the 1st sequence is retained
     if (FALSE %in% repeattest) {
          return(vectora)
     } else {
          return(vectora[1:(firstrepeat - 1)])
     }
}
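
# quick checks of 'removerepeats' (illustrative examples, not part of the
#    original analysis):
# removerepeats(c("a", "b", "a", "b"))   # returns c("a", "b")
# removerepeats(c("a", "a"))             # returns "a"
# removerepeats(c("a", "b", "a", "c"))   # 2nd sequence mismatches; unchanged
# removerepeats(c("a", "b", "a"))        # length not a multiple; unchanged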

# 'n.imper' and 'n.incom' might be in wrong order in 'wordstatscolnames' below
# 'p.imper' and 'p.incom' might be in wrong order in 'wordstatscolnames' below
wordstatscolnames = c(
     # from word_stats function
     "all", "n.sent", "n.words", "n.char", "n.syl", "n.poly", 
     "wps", "cps", "sps", "psps", "cpw", "spw", "pspw", 
     "n.state", "n.quest", "n.exclm", "n.imper", "n.incom", 
     "p.state", "p.quest", "p.exclm", "p.imper", "p.incom", 
     "n.hapax", "n.dis", "grow.rate", "prop.dis",
     # from 'pronoun_type' function
     "pronoun.word.count", "I", "we", "you", "he", "she", 
     "they", "it", "me", "us", "him", "her", "them", "their",
     # from 'polarity' function
     "total.sentences", "total.words", "ave.polarity",
     "sd.polarity", "stan.mean.polarity",
     # from 'automated_readability_index' function
     "read.word.count", "sentence.count", "character.count",
     "Automated_Readability_Index",
     # from 'diversity' function
     "wc", "simpson", "shannon", "collision", 
     "berger_parker", "brillouin",
     # from 'lexical_classification' function
     "lexical.word.count", "ave.content.rate", "SE", "n.content",
     "n.functional", # "content", "functional",
     # from 'formality' function
     "formality.word.count", "formality") 
# 'wstatsendcol' is the column index of 'prop.dis', the last 'word_stats' column;
# 'afterwstatscolnum' is the number of columns that follow 'prop.dis'
wstatsendcol = which(wordstatscolnames == "prop.dis")
afterwstatscolnum = length(wordstatscolnames) - wstatsendcol

temparray = rep(NA, length(wordstatscolnames))
temparray = as.data.frame(t(temparray))
colnames(temparray) = wordstatscolnames

# starttime = proc.time()
for (iter in 1:dim(revs)[1]) {
     otherstats = rep(NA, afterwstatscolnum)         # initialize 'otherstats', which holds results from functions other than 'word_stats'
     otherstats = as.data.frame(t(otherstats))
     if (grepl("[a-zA-Z]", revs$text[iter]) == FALSE) {          # if there is no text (i.e., no alphabetic characters) in the review
          wordstats = rep(NA, length(wordstatscolnames))         # create 'wordstats' with 'NA's for all values
          wordstats = as.data.frame(t(wordstats))
          colnames(wordstats) = wordstatscolnames
     } else {                                                    # else create 'wordstats' with 'word_stats' function calculations
          tempsplit = sentSplit(revs[iter, ], "text")
          adjrownum = length(removerepeats(tempsplit$text))
          tempsplit = tempsplit[1:adjrownum, ]
          otherstats[1:14] = pronoun_type(tempsplit$text)$prop[ , -1]
          otherstats[15:19] = polarity(tempsplit$text)$group[ , -1]
          otherstats[20:23] = automated_readability_index(tempsplit$text)$Readability[ , -1]
          otherstats[24:29] = diversity(tempsplit$text)[ , -1]
          # 'lexical_classification' and 'formality' produce errors if there's
          # only one word of text in the review, so they are skipped in this case
          if (otherstats[1] > 1) {
               otherstats[30:34] = lexical_classification(tempsplit$text)$lexical_classification[ , 2:6]
               dummy <- capture.output(otherstats[35:36] <- formality(tempsplit$text)$formality[ , -1])   # hacky way of preventing 'formality' from printing output
          } else {
               otherstats[30:36] = NA
          }
          wordstats = word_stats(tempsplit$text)$gts
          
          # if the last column name of 'wordstats' is not "prop.dis" (i.e., the
          # last of the 'word_stats' columns), the 'if' condition in the 'for'
          # loop below would index past the final column and produce an error;
          # appending a "prop.dis" column of 'NA' here avoids this
          if (colnames(wordstats)[dim(wordstats)[2]] != wordstatscolnames[wstatsendcol]) {  
               wordstats[dim(wordstats)[2] + 1] = NA
               colnames(wordstats)[dim(wordstats)[2]] = wordstatscolnames[wstatsendcol]
          }
          
          # if a column in 'wordstats' is not present, insert column with 'NA' as value
          for (iter2 in 1:wstatsendcol) {
               if (colnames(wordstats)[iter2] != wordstatscolnames[iter2]) {
                    wordstats = data.frame(wordstats[1:(iter2 - 1)], NA, wordstats[iter2:dim(wordstats)[2]])
               }
          }
     }
     wordstats[28:63] = otherstats
     colnames(wordstats) = wordstatscolnames
     wordstats$all = as.character(wordstats$all)  # convert 'factor' to 'character' so 'review_id' can be added (below)
     wordstats$all = revs$review_id[iter]
     temparray = rbind.data.frame(temparray, wordstats)
     if (iter %% 100 == 0) {print(iter)}
}
# proc.time() - starttime
saveRDS(temparray, file = "temparray.rds")
temparray <- readRDS("temparray.rds")
temparray = temparray[-1, ]
rownames(temparray) = seq(from = 1, to = dim(temparray)[1])
# table(revs$review_id == temparray$all)    # checks that rows match between 'revs' and 'temparray'
revsall = cbind(revs, temparray)
saveRDS(revsall, file = "revsall.rds")


#--------------------------
# add word frequencies to reviews data frame

library(tm)

revsall <- readRDS("revsall.rds")

# takes a character vector of texts and returns a named vector of counts of
#    the words/tokens contained in the texts, sorted in decreasing frequency
corpfreqprep <- function(vectoroftexts, removepunct = TRUE) {
     corpus <- Corpus(VectorSource(vectoroftexts))
     corpus <- tm_map(corpus, content_transformer(removeNumbers))
     corpus <- tm_map(corpus, content_transformer(stripWhitespace))
     if (removepunct) corpus <- tm_map(corpus, content_transformer(removePunctuation))
     corpfreq <- colSums(as.matrix(DocumentTermMatrix(corpus)))
     corpfreq <- sort(corpfreq, decreasing = TRUE)
     return(corpfreq)
}
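
# example use of 'corpfreqprep' (illustrative; 'DocumentTermMatrix' lowercases
#    and drops words shorter than 3 characters by default):
# corpfreqprep(c("The cat sat.", "The cat ran."))
# returns a named count vector along the lines of:  cat 2, the 2, ran 1, sat 1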

# frequencies of all words used across all reviews (not adjusted for length of reviews)
alltextsfreq <- corpfreqprep(revsall$text, removepunct = TRUE)

# remove words (mostly proper nouns) that might not generalize well to other data sets
# 331 vegas, 393 yelp, 543 phoenix, 561 las, 660 scottsdale, 753 groupon, 1047 arizona
# 1051 mayo, 1209 chandler, 1217 summerlin, 1331 john, 1333 nevada, 1385 miller
# 1426 gilbert, 1317 north, 1407 west
wordstoremove <- c(331, 393, 543, 561, 660, 753, 1047, 1051, 1209, 1217, 1317,
                   1331, 1333, 1385, 1407, 1426)
alltextsfreq <- alltextsfreq[-wordstoremove]

# select words that appear at least 100 times across all reviews
# (using 100 as the cut-off is somewhat arbitrary); counting directly is more
# robust than 'match(99, alltextsfreq) - 1', which returns NA if no word
# appears exactly 99 times
commonwordsnum <- sum(alltextsfreq >= 100)
commonwords <- names(alltextsfreq)[1:commonwordsnum]

# add columns to data frame for each common word's frequency in each review
revsallcolnum <- dim(revsall)[2]
revsall[(revsallcolnum + 1):(revsallcolnum + commonwordsnum)] <- NA
colnames(revsall)[(revsallcolnum + 1):(revsallcolnum + commonwordsnum)] <- commonwords

# for each row in 'revsall', i.e., for each review
for (iter in 1:dim(revsall)[1]) {
     if (iter %% 100 == 0) {print(iter)}
     tempfreq <- corpfreqprep(revsall$text[iter], removepunct = TRUE)
     # if 'corpfreqprep' returns 'logical(0)' to 'tempfreq', assigning "" to
     #    'tempfreq' avoids a subsequent error
     # this error occurs 8 times in the data set at these row indices in 'revsall':
     #    1711, 1954, 6760, 8069, 8417, 8486, 9783, 10902
     # the review texts at these row indices are:  "Ok|", "|", ".", "|", "c|", "A|", "NT|", ":)|"
     # these errors do not affect the totals for the most common words, so the
     #    correction below is adequate
     if (length(tempfreq) == 0) {
          tempfreq <- 0
          names(tempfreq) <- ""
     }
     # for each word counted in a review
     for (iter2 in 1:length(tempfreq)) {
          # if the word counted in the review is in the common words
          if (names(tempfreq)[iter2] %in% commonwords) {
               wordcoltemp <- which(colnames(revsall)[(revsallcolnum + 1):dim(revsall)[2]] ==
                                    names(tempfreq)[iter2])
               wordcol <- wordcoltemp + revsallcolnum
               # save the frequency of the word as a proportion of the number of words in the review
               revsall[iter, wordcol] <- tempfreq[iter2] / revsall$n.words[iter]
          }
     }
}
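
# aside: a sketch of a vectorized alternative that builds one document-term
#    matrix over all reviews instead of looping review by review (assumes the
#    same preprocessing as 'corpfreqprep'; untested against the loop above):
# corpus <- Corpus(VectorSource(revsall$text))
# corpus <- tm_map(corpus, content_transformer(removeNumbers))
# corpus <- tm_map(corpus, content_transformer(stripWhitespace))
# corpus <- tm_map(corpus, content_transformer(removePunctuation))
# dtm <- as.matrix(DocumentTermMatrix(corpus))
# keep <- intersect(commonwords, colnames(dtm))
# wordprops <- dtm[ , keep] / revsall$n.words   # proportion of words per review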

saveRDS(revsall, file = "revsallwordfreqs.rds")

#--------------------------
# modify variables and remove unneeded variables

expdata <- readRDS("revsallwordfreqs.rds")

# checks word frequencies in 'expdata' against word frequencies in 'alltextsfreq'
# to make sure data were added to 'revsall'/'expdata' correctly
# startcol <- 73
# stopcol <- dim(expdata)[2]
# coltotals <- rep(NA, length(startcol:stopcol))
# nwordscolnum <- 12
# for (colnum in startcol:stopcol){
#      rowwordtotal <- 0
#      for (iter in 1:dim(expdata)[1]) {
#           if (!is.na(expdata[iter, colnum])) {
#                rowwordnum <- expdata[iter, colnum] * expdata[iter, nwordscolnum]
#                rowwordtotal <- rowwordtotal + rowwordnum
#           }
#      }
#      coltotals[colnum - (startcol - 1)] <- rowwordtotal
# }
# table(coltotals == alltextsfreq[1:commonwordsnum])     # all 1462 should be TRUE; should be no FALSEs

# 'ave.polarity' is the total polarity divided by the number of sentences, but
# the sentence number counts are unreliable.  Therefore, 'ave.polarity' will be
# transformed to be polarity per word (instead of "polarity per sentence").
# plot(abs(revsall$n.words - revsall$total.words)/mean(c(revsall$n.words, revsall$total.words), na.rm=T))
expdata$ave.polarity = (expdata$ave.polarity * expdata$total.sentences) / expdata$total.words

# replace 'NA's with zeroes
expdata[is.na(expdata)] <- 0

# word counts for different functions don't completely agree
# columns of word counts:  12, 37, 52, 56, 60, 66
# cor(expdata[,c(12, 37, 52, 56, 60, 66)], use = "pairwise.complete.obs")
# correlation matrix and graphs show the agreement is very strong -- strong enough to delete extra columns
# table(revsall$n.words == revsall$total.words)

# delete: 37, 52, 56, 60, 66: columns of word counts
# delete 9, review text
# delete column 10, duplicate of 'review_id'
# delete 11 'n.sent':  number of sentences is often incorrect
# delete 16:19:  all 'per sentence' stats, which are often incorrect
# delete 28:32:  proportions of all sentences, which are often incorrect
# delete 33:36:  unsure how unique words would be useful; deleting
# delete 54:55:  polarity standard deviation is by sentence count, which is often incorrect
# delete 56:59:  readability relies on sentence counts, which are often incorrect; can I calculate something useful from this?
# delete 63, 65:  shannon, collision, brillouin correlate very highly; keep shannon
# delete 66, 68:70:  'ave.content.rate' (col 67) is good (expressed as %); other cols unnecessary
# delete 71:  'formality' (col 72) is good (expressed as %); 71 is unnecessary
cullcols = c(9:11, 16:19, 28:32, 33:37, 51:52, 54:60, 63, 65:66, 68:71)
expdata = expdata[ , -cullcols]

saveRDS(expdata, file = "revsfeaturesculled.rds")


#--------------------------
# barplot of ratings by 'alternative' or 'conventional' medicine 

expdata <- readRDS("revsfeaturesculled.rds")

png('ratings.png', width=640, height=512)
plotdata = expdata
starsbymed = cbind(as.data.frame(table(plotdata[plotdata$medicine == "conventional", ]$stars)),
                   as.data.frame(table(plotdata[plotdata$medicine == "alternative", ]$stars)))
starsbymed = starsbymed[ , -c(1, 3)]
starsmedprop = starsbymed
starsmedprop[ , 1] = starsmedprop[ , 1] / sum(starsmedprop[ , 1])
starsmedprop[ , 2] = starsmedprop[ , 2] / sum(starsmedprop[ , 2])
colnames(starsmedprop) = c("conventional", "alternative")

barplot(t(as.matrix(starsmedprop)), beside = T, ylim = c(0, 1), col = c(8, 9),
        xlab = c("Number of stars"), ylab = c("Proportion of reviews"),
        names.arg = c("1", "2", "3", "4", "5"),
        main = "Yelp reviewer ratings of medical establishments")
legend(x = 1, y = 0.98, legend = colnames(starsmedprop), fill = c(8, 9))
abline(h = 0)  
dev.off()


#--------------------------
# splitting data into training (60% of data) and testing (40%) sets, stratified
# by 'medicine' ('conventional' or 'alternative') and star ratings (1 - 5)

expdata <- readRDS("revsfeaturesculled.rds")

var1 <- levels(expdata$medicine)
var2 <- sort(unique(expdata$stars))
testrows <- NA
set.seed(43317)

for (iter1 in 1:length(var1)) {
     for (iter2 in 1:length(var2)) {
          set <- which(expdata$medicine == var1[iter1] & expdata$stars == var2[iter2])
          sampledrows <- sample(set, size = round(0.4 * length(set)), replace = FALSE)
          testrows <- append(testrows, sampledrows)          
     }
}
testrows <- testrows[-1]

traindata <- expdata[-testrows, ]
testdata <- expdata[testrows, ]

saveRDS(traindata, file = 'traindata.rds')
saveRDS(testdata, file = 'testdata.rds')
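
# aside: the same joint stratification can be written with 'split' on the
#    interaction of the two factors; a minimal sketch (same 40% test fraction;
#    the sampled rows will differ from the loop above even with the same seed,
#    since the strata are visited in a different order):
# strata <- interaction(expdata$medicine, expdata$stars)
# set.seed(43317)
# testrows2 <- unlist(lapply(split(seq_len(nrow(expdata)), strata),
#                            function(s) sample(s, size = round(0.4 * length(s)))))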


#--------------------------
# tree model on training set

traindata <- readRDS('traindata.rds')
testdata <- readRDS('testdata.rds')

# column indices for data to be included in analysis
# column 4 is the classification label:  conventional or alternative medicine
# columns 5 - 39 are Yelp data and the standard analyses from the package 'qdap'
# the remaining columns are word usage rates
response_col <- 4
predictor_cols <- 5:dim(traindata)[2]

library(randomForest)

# when 'sampsize' equals the number of cases in a class (n), sampling is done
#   with replacement, so as 'n' grows larger the expected proportion of unique
#   cases in each tree's sample converges toward 1 - 1/e (~63.2%)
sampsize <- min(table(traindata[ , response_col]))
n_trees <- 5000
print_interval <- ceiling(n_trees / 100)
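
# quick check of the 1 - 1/e claim (illustrative):
# n <- 10000
# mean(replicate(100, length(unique(sample(n, n, replace = TRUE))) / n))
# ~0.632, i.e., 1 - exp(-1)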

# parameter search for 'mtry'
set.seed(22640171)
mtry_results <- tuneRF(y=traindata[ , response_col],
                       x=traindata[ , predictor_cols],
                       sampsize=c(sampsize, sampsize),
                       strata=traindata[ , response_col],
                       ntreeTry=n_trees,
                       doBest=F, importance=T, plot=F, do.trace=print_interval)
write.csv(mtry_results, 'mtry_oob_error.csv')
mtry_results <- read.csv('mtry_oob_error.csv', header=T, row.names=1)
mtry <- mtry_results[mtry_results[ , 2] == min(mtry_results[ , 2]), 1]

# train on all training data
set.seed(83875501)
rfmodel <- randomForest(y=traindata[ , response_col],
                        x=traindata[ , predictor_cols],
                        sampsize=c(sampsize, sampsize),
                        strata=traindata[ , response_col],
                        ntree=n_trees, mtry=mtry,
                        keep.inbag=T, keep.forest=T,
                        importance=T, do.trace=print_interval)

saveRDS(rfmodel, file='random_forest_model.rds')


#--------------------------
# evaluate model

library(randomForest)
library(caret)
library(ROCR)

response_col <- 4
traindata <- readRDS('traindata.rds')
testdata <- readRDS('testdata.rds')
rfmodel <- readRDS('random_forest_model.rds')


get_confusion_elements <- function(caret_confusion_matrix) {
     tp <- as.numeric(caret_confusion_matrix$table[1])   # true positives
     fn <- as.numeric(caret_confusion_matrix$table[2])   # false negatives
     fp <- as.numeric(caret_confusion_matrix$table[3])   # false positives
     tn <- as.numeric(caret_confusion_matrix$table[4])   # true negatives
     return( c(tp, fp, tn, fn) )
}

calculate_mcc <- function(tp, fp, tn, fn) {
     # calculates Matthews correlation coefficient
     # tp - true positives
     # fp - false positives
     # tn - true negatives
     # fn - false negatives
     mcc <- ((tp * tn) - (fp * fn)) /
            sqrt( (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) )
     return(mcc)
}
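
# worked example (illustrative):  tp = 40, fp = 10, tn = 45, fn = 5
# calculate_mcc(40, 10, 45, 5)
# ((40 * 45) - (10 * 5)) / sqrt(50 * 45 * 55 * 50) = 1750 / 2487.47 ~ 0.70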

get_mcc <- function(caret_confusion_matrix) {
     # calculates Matthews correlation coefficient from confusion matrix
     ce <- get_confusion_elements(caret_confusion_matrix)
     mcc <- calculate_mcc(ce[1], ce[2], ce[3], ce[4])
     return(mcc)
}
     
calculate_auc_one_classifier <- function(recall_sensitivity_tpr,
                                         specificity_tnr) {
     a <- recall_sensitivity_tpr
     b <- 1 - specificity_tnr
     # a*b/2 + a*(1-b) + (1-a)*(1-b)/2 reduces to:
     auc <- a/2 - b/2 + 1/2
     return(auc)
}
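
# sanity checks (illustrative):  a perfect classifier gives
#    calculate_auc_one_classifier(1, 1), which is 1; chance performance
#    (tpr = 1 - tnr) gives 0.5, e.g., calculate_auc_one_classifier(0.7, 0.3)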

calculate_f1_score <- function(precision, recall, beta=1) {
     # calculates the F score from precision and recall (F1 when beta = 1)
     f1_score <- unname( ( (1 + beta^2) * recall * precision ) /
                         ( (recall + precision) * beta^2) )
     return(f1_score)
}
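
# worked example (illustrative):  precision = 0.8, recall = 0.6, beta = 1
# calculate_f1_score(0.8, 0.6)
# (2 * 0.6 * 0.8) / (0.6 + 0.8) = 0.96 / 1.4 ~ 0.686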
     
get_f1_score <- function(caret_confusion_matrix) {
     # calculates F1 score from confusion matrix
     sensitivity_recall <- caret_confusion_matrix$byClass[1]
     ppv_precision <- caret_confusion_matrix$byClass[3]
     f1_score <- calculate_f1_score(ppv_precision, sensitivity_recall)
     return(f1_score)
}

calculate_informedness <- function(sensitivity_recall_tpr, specificity_tnr) {
     # for dichotomous outcomes, informedness is also known as Youden's J and
     #    as Delta P'
     informedness <- sensitivity_recall_tpr + specificity_tnr - 1
     return(informedness)
}

get_informedness <- function(caret_confusion_matrix) {
     # calculates informedness from confusion matrix
     sensitivity_recall_tpr <- caret_confusion_matrix$byClass[1]
     specificity_tnr <- caret_confusion_matrix$byClass[2]
     inform <- unname(calculate_informedness(sensitivity_recall_tpr,
                                             specificity_tnr))
     return(inform)
}

calculate_markedness <- function(ppv_precision, npv) {
     # for dichotomous outcomes, markedness is also known as Delta P
     markedness <- ppv_precision + npv - 1
     return(markedness)
}

get_markedness <- function(caret_confusion_matrix) {
     # calculates markedness from confusion matrix
     ppv_precision <- caret_confusion_matrix$byClass[3]
     npv <- caret_confusion_matrix$byClass[4]
     markedness <- unname(calculate_markedness(ppv_precision, npv))
     return(markedness)
}

get_metrics <- function(confusion_matrix) {
     inform <- get_informedness(confusion_matrix)
     mark <- get_markedness(confusion_matrix)
     mcc <- get_mcc(confusion_matrix)
     f1 <- get_f1_score(confusion_matrix)
     return( c(inform, mark, mcc, f1) )
}


# confusionMatrix(rfmodel$predicted, traindata[ , response_col])
# random forest OOB classification
rf_oob_classify <- rfmodel$predicted

# random forest classification
rf_classify <- predict(rfmodel, testdata)

class_table <- table(traindata[ , response_col])
class_table_prop <- class_table / sum(class_table)

# random classification, 50%-50%
set.seed(12247081)
random_classify_5050 <- sample(names(class_table), dim(testdata)[1], replace=T)

# random classification, imbalanced (skewed)
set.seed(32571012)
random_classify_skew <- sample(names(class_table), dim(testdata)[1], replace=T,
                               prob=class_table_prop)

# uniform majority classification
max_class_n <- max(class_table)
max_class_name <- names(which(class_table == max_class_n))
uniform_classify <- rep(max_class_name, dim(testdata)[1])

# uniform majority classification, except for one case
min_class_n <- min(class_table)
min_class_name <- names(which(class_table == min_class_n))
almost_uniform_classify <- uniform_classify
almost_uniform_classify[1] <- min_class_name


classify_list <- list(rf_oob_classify, rf_classify, random_classify_5050,
                      random_classify_skew, uniform_classify,
                      almost_uniform_classify)

confusion_matrices <- vector('list', length(classify_list))
col_names <- c('sensitivity', 'specificity', 'inform', 'mark', 'mcc', 'f1',
               'auc')
metrics <- data.frame(matrix(nrow=length(classify_list),
                             ncol=length(col_names)))
rownames(metrics) <- c('rf_oob', 'rf', 'random_50-50', 'random_skew',
                       'uniform', 'almost_uniform')
colnames(metrics) <- col_names
for (i in 1:length(classify_list)) {
     if (i == 1) {
         confusion_matrices[[i]] <- confusionMatrix(classify_list[[i]],
                                                    traindata[ , response_col])
     } else {
         confusion_matrices[[i]] <- confusionMatrix(classify_list[[i]],
                                                    testdata[ , response_col])
     }
    metrics[i, 1] <- confusion_matrices[[i]]$byClass[1]
    metrics[i, 2] <- confusion_matrices[[i]]$byClass[2]
    metrics[i, 3:(dim(metrics)[2]-1)] <- get_metrics(confusion_matrices[[i]])
    metrics[i, dim(metrics)[2]] <- calculate_auc_one_classifier(metrics[i, 1],
                                                                metrics[i, 2])
}




rf_classify_prob <- predict(rfmodel, testdata, type='prob')
rf_test_pred <- prediction(rf_classify_prob[ , 1], testdata[ , response_col],
                           c(max_class_name, min_class_name))
rf_test_roc <- performance(rf_test_pred, 'tpr', 'fpr')

png('rf_roc_auc.png', width=640, height=640)
plot(rf_test_roc, col=2)
points(y=metrics$sensitivity[-1], x=1-metrics$specificity[-1], pch=16)
points(y=metrics[2, 1], x=1-metrics[2, 2], pch=16, col=2)
abline(0, 1, lty=2)
lines(y=c(0, metrics$sensitivity[2]), x=c(0, 1-metrics$specificity[2]), col=2)
lines(y=c(1, metrics$sensitivity[2]), x=c(1, 1-metrics$specificity[2]), col=2)
dev.off()

# add OOB to plot
# rf_oob_pred <- prediction(rfmodel$votes[ , 1], rfmodel$y,
#                       c(max_class_name, min_class_name))
# rf_oob_roc <- performance(rf_oob_pred, 'tpr', 'fpr')
# points(y=metrics[1, 1], x=1-metrics[1, 2], pch=16, col=3)
# plot(rf_oob_roc, add=T, col=3)


# AUC
performance(rf_test_pred, 'auc')@y.values[[1]]


# verifying on plot that random forest model defaulted to 0.50 voting threshold
# a = cbind(rf_test_roc@x.values[[1]],
#           rf_test_roc@y.values[[1]],
#           rf_test_roc@alpha.values[[1]])
# plot(rf_test_roc, col=2, ylim=c(0.6, 1), xlim=c(0, 0.15))
# points(y=metrics[2, 1], x=1-metrics[2, 2], pch=16, col=2)
# points(y=a[934, 2], x=a[934, 1])
# dots overlap, so yes, the voting threshold was 0.50


get_y_intercept <- function(x, y, slope) {
     return(y - (slope * x))
}

slope <- max_class_n / min_class_n
intercept <- get_y_intercept(1 - metrics[2, 2], metrics[2, 1], slope)

png('rf_roc_isocost.png', width=640, height=512)
plot(rf_test_roc, col=2)
points(y=metrics$sensitivity[-1], x=1-metrics$specificity[-1], pch=16)
points(y=metrics[2, 1], x=1-metrics[2, 2], pch=16, col=2)
abline(0, 1, lty=2)
abline(intercept, slope)
dev.off()



rf_test_mcc <- performance(rf_test_pred, 'mat')
mcc_max <- max(rf_test_mcc@y.values[[1]], na.rm=T)
mcc_max_idx <- which(rf_test_mcc@y.values[[1]] == mcc_max)
mcc_max_threshold <- rf_test_mcc@x.values[[1]][mcc_max_idx]

png('rf_mcc_threshold.png', width=640, height=512)
plot(rf_test_mcc)
abline(v=mcc_max_threshold, lty=2)
dev.off()




thresholds <- rf_test_roc@alpha.values[[1]]
# min(abs(thresholds - mcc_max_threshold))
# plot( abs(thresholds - mcc_max_threshold) ~ c(1:2812), xlim=c(810, 820), ylim=c(0, 0.05))
thresholds_idx <- which(thresholds == mcc_max_threshold)
roc_y <- rf_test_roc@y.values[[1]][thresholds_idx]
roc_x <- rf_test_roc@x.values[[1]][thresholds_idx]

png('rf_roc_max_mcc.png', width=640, height=512)
plot(rf_test_roc, col=2, ylim=c(0.6, 1), xlim=c(0, 0.10))
points(y=metrics[2, 1], x=1-metrics[2, 2], pch=16, col=2, cex=2)
abline(intercept, slope)
points(y=roc_y, x=roc_x, pch=16, col=3, cex=2)
dev.off()



rf_predict_optimal_threshold <- as.factor(rf_classify_prob[ , 1] < mcc_max_threshold)
levels(rf_predict_optimal_threshold) <- c(min_class_name, max_class_name)
rf_confusion_optimal_threshold <- confusionMatrix(rf_predict_optimal_threshold,
                                                  testdata[ , response_col])
rf_optimized_metrics <- metrics[2:3, 1:(dim(metrics)[2] - 1)]
rownames(rf_optimized_metrics)[2] <- 'rf_optimized'
rf_optimized_metrics[2, 1] <- rf_confusion_optimal_threshold$byClass[1]
rf_optimized_metrics[2, 2] <- rf_confusion_optimal_threshold$byClass[2]
rf_optimized_metrics[2, 3:6] <- get_metrics(rf_confusion_optimal_threshold)

# plots informedness on ROC plot
# points(y=rf_optimized_metrics$inform[1], x=1-rf_optimized_metrics$specificity[1], col=2)
# points(y=rf_optimized_metrics$inform[2], x=1-rf_optimized_metrics$specificity[2], col=3)



rf_test_prc <- performance(rf_test_pred, 'prec', 'rec')

png('rf_prc_max_mcc.png', width=640, height=512)
plot(rf_test_prc, col=2, ylim=c(0, 1))
# plot(rf_test_prc, col=2, xlim=c(0.75, 0.9), ylim=c(0.75, 0.9))
min_prevalence <- min(class_table_prop)
abline(h=min_prevalence, lty=2)
points(y=confusion_matrices[[2]]$byClass[3],
       x=confusion_matrices[[2]]$byClass[1],
       col=2, pch=16)
points(y=rf_confusion_optimal_threshold$byClass[3],
       x=rf_confusion_optimal_threshold$byClass[1],
       col=3, pch=16)
points(y=rf_optimized_metrics$mark[1], x=rf_optimized_metrics$inform[1], col=2)
points(y=rf_optimized_metrics$mark[2], x=rf_optimized_metrics$inform[2], col=3)
dev.off()




sink(file = "rf_output.txt")
rfmodel
rf_confusion_optimal_threshold 
metrics
rf_optimized_metrics
sink(file = NULL)

png('rf_variable_importance.png', width=640, height=640)
varImpPlot(rfmodel, n.var=30, main='Variable Importance', pch=16, cex=1)
dev.off()

png('rf_oob_error_by_tree.png', width=640, height=512)
plot(rfmodel)
dev.off()


var_imp <- importance(rfmodel)
var_imp <- var_imp[order(var_imp[ , 4], decreasing=T), ]



library(dplyr)

expdata <- readRDS('revsfeaturesculled.rds')
traindata <- readRDS('traindata.rds')

con_alt_means <- expdata[ , -c(1:3)] %>%
     group_by(medicine) %>%
     summarise_each(funs(mean))

con_alt_means <- as.data.frame(t(con_alt_means[ , -1]))
colnames(con_alt_means) <- c('alternative', 'conventional')
con_alt_means$alt_con_ratio <- (con_alt_means$alternative /
                                con_alt_means$conventional)
con_alt_means$con_alt_ratio <- (con_alt_means$conventional /
                                con_alt_means$alternative)

write.table(con_alt_means, 'con_alt_means.txt', sep='\t\t\t', row.names=T)


con_alt_means_train <- traindata[ , -c(1:3)] %>%
     group_by(medicine) %>%
     summarise_each(funs(mean))

con_alt_means_train <- as.data.frame(t(con_alt_means_train[ , -1]))
colnames(con_alt_means_train) <- c('alternative', 'conventional')
con_alt_means_train$alt_con_ratio <- (con_alt_means_train$alternative / 
                                      con_alt_means_train$conventional)
con_alt_means_train$con_alt_ratio <- (con_alt_means_train$conventional / 
                                      con_alt_means_train$alternative)

write.table(con_alt_means_train, 'con_alt_means_train.txt', sep='\t\t\t',
            row.names=T)