Plot Model Topics: Code

Here is the code:

library(ggplot2)


capitalize <- function(a_string) {
    # Upper-case the first letter of every word in a string.
    #
    # Periods '.' are treated as word separators exactly like spaces. The
    # capitalized words are joined back together with a period, so
    # 'movie television' becomes 'Movie.Television'.
    #
    # Args:
    #   a_string: a single character string; if a longer vector is passed,
    #       only its first element is processed.
    #
    # Returns:
    #   A single character string: the capitalized words separated by '.'.
    separator <- ' '
    # literal (fixed) matching: neither the period nor the space is a pattern
    spaced <- gsub('.', separator, a_string, fixed=TRUE)
    words <- strsplit(spaced, separator, fixed=TRUE)[[1]]
    # only the first character is changed; the rest of each word keeps its case
    paste0(toupper(substring(words, 1, 1)), substring(words, 2), collapse='.')
}


# read the topic table (first line is a header) and keep only its third
#   column, stored as a one-column data frame named 'value'
file_name <- 'topic_labels.csv'
data_table <- read.csv(file_name, header=TRUE)
data_series <- data.frame(value=data_table[ , 3])

# one hand-written description per topic, in the same order as the rows of
#   'data_series'
labels <- c('movie television', 'politics', 'music',
    'government units', 'football soccer', 'unclear',
    'performing arts writing', 'war', 'unclear', 'unclear',
    'wikipedia editing', 'football basketball hockey', 'unclear',
    'wikipedia editing', 'sports', 'editing SE Asia', 'history',
    'picture copyright', 'local geography building', 'wikipedia editing')


# vapply guarantees exactly one string per label; like sapply, it keeps the
#   original strings as element names
labels <- vapply(labels, capitalize, character(1))

# re-order rows (topics) so that most important/relevant are first
topic_order <- c(1:5, 7, 8, 12, 15, 17, 19, 11, 14, 20, 18, 16, 6, 9, 10, 13)
# the labels and the ordering are hard-coded, so the data must have exactly
#   one row per label; otherwise the re-ordering below would silently recycle
#   values or fill in NAs
if (nrow(data_series) != length(topic_order) ||
        length(labels) != length(topic_order)) {
    stop('expected ', length(topic_order), ' topics but found ',
         nrow(data_series), ' data rows and ', length(labels), ' labels',
         call.=FALSE)
}
# both are reversed after re-ordering so that values and labels stay aligned
#   NOTE(review): presumably reversed so the first topics end up at the top
#   of the flipped bar chart -- confirm
data_series[ , 1] <- rev(data_series[topic_order, 1])
labels <- rev(labels[topic_order])

# colors for each data series: 'colorful_col_n' rainbow colors followed by
#   gray for every remaining row, then reversed to match the reversed order
#   of 'labels'
colorful_col_n <- 11
series_colors <- rev(c(rainbow(n=colorful_col_n),
                    rep('gray', nrow(data_series) - colorful_col_n)))
# named by label so the colors can be looked up by label
names(series_colors) <- labels

# add x-axis plotting index (seq_len stays empty for zero rows, unlike 1:n)
idx_data_series <- data.frame(index=seq_len(nrow(data_series)), data_series)


# horizontal bar chart of one value per topic, filled by topic label
bkgd <- 'gray20'
topic_plot <- ggplot(idx_data_series, aes(x=index, y=value, fill=labels)) +
    geom_bar(stat='identity') +
    # flip the axes so the bars run horizontally and labels are on the side
    coord_flip() +
    scale_fill_manual(values=series_colors) +
    # one tick per row, shown as the topic label instead of the numeric index
    scale_x_continuous(breaks=seq_len(nrow(data_series)), labels=labels) +
    labs(title='Cv Coherences of Wikipedia Topics',
         x='Topics',
         y='Cv Coherences') +
    theme(panel.background=element_rect(fill=bkgd),
          panel.grid.major=element_line(color='gray27'),
          panel.grid.minor=element_blank(),
          legend.position='none',
          plot.title=element_text(hjust=0.5))
# display the plot explicitly, since assigning it suppresses auto-printing
print(topic_plot)

# save exactly this plot rather than relying on ggplot2's "last plot" state
output_file <- file.path(getwd(), 'topic_coherences_model102.png')
ggsave(output_file, plot=topic_plot, width=11.5, height=7)