Model Topics: Code

Here are the Anaconda `environment.yml` specifications:

name: gensim_nltk_pandas
dependencies:
- boto=2.47.0=py36_0
- bz2file=0.98=py36_0
- gensim=2.2.0=np113py36_0
- libgfortran=3.0.0=1
- mkl=2017.0.3=0
- nltk=3.2.4=py36_0
- numpy=1.13.1=py36_0
- openssl=1.0.2l=0
- pandas=0.20.2=np113py36_0
- pip=9.0.1=py36_1
- python=3.6.2=0
- python-dateutil=2.6.0=py36_0
- pytz=2017.2=py36_0
- readline=6.2=2
- requests=2.14.2=py36_0
- scipy=0.19.1=np113py36_0
- setuptools=27.2.0=py36_0
- six=1.10.0=py36_0
- smart_open=1.5.3=py36_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py36_0
- xz=5.2.2=1
- zlib=1.2.8=3
- pip:
- smart-open==1.5.3

Here is the code:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#! /usr/bin/env python3


def get_sibling_directory_path(sibling_directory_name):
    '''
    returns path for a specified folder that is in the same parent directory as
        the current working directory

    'sibling_directory_name' - name of the sibling folder (not a full path)
    '''

    import os

    # os.path.dirname handles trailing separators and platform differences
    # more robustly than manually slicing at the last os.sep position
    current_path = os.getcwd()
    parent_directory_path = os.path.dirname(current_path)
    sibling_directory_path = os.path.join(parent_directory_path,
                                          sibling_directory_name)
    return(sibling_directory_path)


def get_dictionary_list():
    '''
    Returns the full list of English words from the NLTK 'words' corpus
    '''

    from nltk.corpus import words

    return(words.words())


def stem_words(word_list):
    '''
    Applies the NLTK Lancaster stemmer to every word in 'word_list' and
        returns the stemmed words as a new list
    '''

    from nltk.stem.lancaster import LancasterStemmer

    lancaster = LancasterStemmer()

    stemmed_words = []
    for a_word in word_list:
        stemmed_words.append(lancaster.stem(a_word))

    return(stemmed_words)


def extract_topic_words(topic_words_n, lda_model, topics_n=50):
    '''
    Extracts the most probable words of each topic from 'lda_model' and their
        corresponding probabilities

    'topic_words_n' - number of words to extract for each topic
    'lda_model' - trained Gensim LdaModel object
    'topics_n' - number of topics to extract (defaults to 50, the value
        previously hard-coded)

    Returns a tuple of two parallel lists of lists: the words for each topic
        and the corresponding probabilities for each topic
    '''

    import re

    # each topic string looks like:  0.030*"word" + 0.020*"other" + ...
    # parsing with a regex is more robust than manually indexing quote, '+',
    # and '*' positions, which breaks on any change in spacing
    pair_pattern = re.compile(r'([\d.]+)\s*\*\s*"([^"]*)"')

    topic_words = lda_model.print_topics(num_words=topic_words_n,
                                         num_topics=topics_n)
    # each element of 'print_topics' output is a (topic_id, string) pair
    words_info = [e[1] for e in topic_words]

    words = []
    numbers = []

    for topic_string in words_info:
        pairs = pair_pattern.findall(topic_string)
        words.append([w for _, w in pairs])
        numbers.append([float(n) for n, _ in pairs])

    return(words, numbers)


def stem_groups_by_topic(topic_words, stem_groups, topic_probs):
    '''
    Builds a table matching each topic's stemmed words to the non-stemmed
        words from which they may have been derived

    'topic_words' - list (one element per topic) of lists of stemmed words
    'stem_groups' - DataFrame whose first column holds stems and whose second
        column holds the corresponding groups of non-stemmed words
    'topic_probs' - list of lists of word probabilities, parallel in shape to
        'topic_words'

    Returns a DataFrame with columns 'topic', 'probs', 'stem', 'non_stemmed'
    '''

    import pandas as pd

    rows = []
    # plain list membership is used because .str.match() was not returning
    # only exact matches
    stem_list = stem_groups.iloc[:, 0].tolist()

    for i in range(len(topic_words)):
        for j in range(len(topic_words[i])):

            if topic_words[i][j] in stem_list:
                idx = stem_list.index(topic_words[i][j])
                # .iloc replaces the deprecated (and since-removed) .ix
                # indexer; 'idx' is positional, matching 'stem_list' order
                non_stems = stem_groups.iloc[idx, 1]
            else:
                non_stems = ''

            rows.append([i, topic_probs[i][j], topic_words[i][j], non_stems])

    col_names = ['topic', 'probs', 'stem', 'non_stemmed']
    stem_groups_topic = pd.DataFrame.from_records(rows, columns=col_names)

    return(stem_groups_topic)


def write_list_to_text_file(a_list, text_file_name, overwrite_or_append = 'a'):
    '''
    writes a list of strings to a text file, one element per line
    appends by default; change to overwriting by setting to 'w' instead of 'a'
    '''

    # 'with' guarantees the file is closed even if a write fails; the original
    # try/finally raised a NameError (masking the real error) whenever open()
    # itself failed, because 'textfile' was never bound
    with open(text_file_name, overwrite_or_append, encoding = 'utf-8') as textfile:
        for element in a_list:
            textfile.write(element)
            textfile.write('\n')


def main():
    '''
    Gets most probable (stemmed) words for each topic in a Gensim LDA model
        and matches them to non-stemmed words from which they might have been
        derived
    '''

    import os
    import pandas as pd
    from gensim.models.ldamodel import LdaModel

    # build a table pairing every English dictionary word with its stem
    dictionary_words = get_dictionary_list()
    stems = stem_words(dictionary_words)

    stem_col = 'stemmed'
    word_col = 'words'
    word_df = pd.DataFrame({word_col: dictionary_words, stem_col: stems})
    stem_groups = (word_df.groupby(stem_col, as_index=False)[[word_col]]
                   .aggregate(lambda x: list(x)))

    # load the trained LDA model from the sibling results directory
    model_num = 'model102'
    lda_path = os.path.join(
        get_sibling_directory_path('45_lda_train_interval5'), model_num)
    lda = LdaModel.load(os.path.join(lda_path, 'lda_' + model_num))

    # pull the most probable words (and their probabilities) for each topic
    topic_words_n = 50
    topic_words, topic_probs = extract_topic_words(topic_words_n, lda)

    stem_groups_topic = stem_groups_by_topic(topic_words, stem_groups,
                                             topic_probs)

    # write the stem-reference tables and the per-topic matches to CSV
    word_df.to_csv(os.path.join(lda_path, 'stem_reference.csv'),
                   index=False)
    stem_groups.to_csv(os.path.join(lda_path, 'stem_reference_grouped.csv'),
                       index=False)
    stem_groups_topic.to_csv(os.path.join(lda_path, 'stem_groups_by_topic.csv'),
                             index=False)


# run the full pipeline only when executed as a script, not on import
if __name__ == '__main__':
    main()