Here are the Anaconda `environment.yml` specifications:
name: gensim_nltk_pandas
dependencies:
  - boto=2.47.0=py36_0
  - bz2file=0.98=py36_0
  - gensim=2.2.0=np113py36_0
  - libgfortran=3.0.0=1
  - mkl=2017.0.3=0
  - nltk=3.2.4=py36_0
  - numpy=1.13.1=py36_0
  - openssl=1.0.2l=0
  - pandas=0.20.2=np113py36_0
  - pip=9.0.1=py36_1
  - python=3.6.2=0
  - python-dateutil=2.6.0=py36_0
  - pytz=2017.2=py36_0
  - readline=6.2=2
  - requests=2.14.2=py36_0
  - scipy=0.19.1=np113py36_0
  - setuptools=27.2.0=py36_0
  - six=1.10.0=py36_0
  - smart_open=1.5.3=py36_0
  - sqlite=3.13.0=0
  - tk=8.5.18=0
  - wheel=0.29.0=py36_0
  - xz=5.2.2=1
  - zlib=1.2.8=3
  - pip:
    - smart-open==1.5.3
Here is the code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
#! /usr/bin/env python3


def get_sibling_directory_path(sibling_directory_name):
    '''
    Returns the path of the folder named 'sibling_directory_name' that is
        in the same parent directory as the current working directory
    '''

    import os

    # 'os.path.dirname' replaces the manual 'rfind(os.sep)' slicing of the
    # original, with identical results for a normal working-directory path
    parent_directory_path = os.path.dirname(os.getcwd())
    sibling_directory_path = os.path.join(parent_directory_path,
                                          sibling_directory_name)

    return sibling_directory_path


def get_dictionary_list():
    '''
    Retrieves the list of all English dictionary words from the NLTK
        'words' corpus (requires the corpus to be downloaded)
    '''

    from nltk.corpus import words

    return words.words()


def stem_words(word_list):
    '''
    Stems each word in 'word_list' with the NLTK Lancaster stemmer and
        returns the resulting list (same length and order as 'word_list')
    '''

    from nltk.stem.lancaster import LancasterStemmer

    stemmer = LancasterStemmer()

    return [stemmer.stem(w) for w in word_list]


def extract_topic_words(topic_words_n, lda_model):
    '''
    Extracts the most probable words of each topic from 'lda_model' and
        their corresponding probabilities
    'topic_words_n' - number of words to extract for each topic
    'lda_model' - trained Gensim LdaModel object
    Returns a tuple '(words, numbers)' where 'words' is a list (one element
        per topic) of lists of words and 'numbers' is the parallel list of
        lists of their probabilities
    '''

    # 'print_topics' returns one string per topic of the form
    # '0.030*"cat" + 0.020*"dog" + ...'; words and probabilities are
    # parsed out of that string below
    topic_words = lda_model.print_topics(num_words=topic_words_n,
                                         num_topics=50)
    words_info = [e[1] for e in topic_words]

    words = []
    numbers = []
    for info in words_info:

        # each word is enclosed in double quotes: pair up successive
        # quote positions to slice the words out
        quotes_idx = [p for p, q in enumerate(info) if q == '\"']
        q_start = quotes_idx[::2]
        q_end = quotes_idx[1::2]
        ws = [info[q_start[j] + 1:q_end[j]] for j in range(len(q_start))]
        words.append(ws)

        # each probability immediately precedes an asterisk; terms are
        # joined by ' + ', so a probability starts 2 characters after a '+'
        plus_idx = [p for p, q in enumerate(info) if q == '+']
        asterisk_idx = [p for p, q in enumerate(info) if q == '*']
        ns = [float(info[0:asterisk_idx[0]])]
        ns2 = [float(info[plus_idx[j] + 2:asterisk_idx[j + 1]])
               for j in range(len(plus_idx))]
        ns.extend(ns2)
        numbers.append(ns)

    return words, numbers


def stem_groups_by_topic(topic_words, stem_groups, topic_probs):
    '''
    Builds a table matching each topic's stemmed words (and their
        probabilities) to the group of non-stemmed words from which each
        stem may have been derived
    'topic_words' - list (one element per topic) of lists of stemmed words
    'stem_groups' - DataFrame whose first column holds stems and whose
        second column holds the corresponding groups of non-stemmed words
    'topic_probs' - list of lists of probabilities parallel to 'topic_words'
    Returns a DataFrame with columns 'topic', 'probs', 'stem',
        'non_stemmed'; stems not found in 'stem_groups' get an empty string
        for 'non_stemmed'
    '''

    import pandas as pd

    rows = []
    # '.str.match()' does not return only exact matches, so a plain list
    # membership test / 'index' lookup is used instead
    stem_list = stem_groups.iloc[:, 0].tolist()
    for i in range(len(topic_words)):
        for j in range(len(topic_words[i])):
            if topic_words[i][j] in stem_list:
                idx = stem_list.index(topic_words[i][j])
                # '.iloc' replaces the deprecated (and since-removed) '.ix'
                # indexer; 'idx' is positional, so behavior is identical
                non_stems = stem_groups.iloc[idx, 1]
            else:
                non_stems = ''
            rows.append([i, topic_probs[i][j], topic_words[i][j], non_stems])

    col_names = ['topic', 'probs', 'stem', 'non_stemmed']
    stem_groups_topic = pd.DataFrame.from_records(rows, columns=col_names)

    return stem_groups_topic


def write_list_to_text_file(a_list, text_file_name, overwrite_or_append='a'):
    '''
    writes a list of strings to a text file, one element per line
    appends by default; change to overwriting by setting
        'overwrite_or_append' to 'w' instead of 'a'
    '''

    # 'with' guarantees the file is closed; the original 'try/finally'
    # raised a NameError in 'finally' whenever 'open' itself failed
    with open(text_file_name, overwrite_or_append,
              encoding='utf-8') as textfile:
        for element in a_list:
            textfile.write(element)
            textfile.write('\n')


def main():
    '''
    Gets most probable (stemmed) words for each topic in a Gensim LDA model
        and matches them to non-stemmed words from which they might have
        been derived
    '''

    import os
    import pandas as pd
    from gensim.models.ldamodel import LdaModel

    # get English dictionary words and their stemmed versions
    word_list = get_dictionary_list()
    stemmed_list = stem_words(word_list)
    col0_name = 'stemmed'
    col1_name = 'words'
    word_df = pd.DataFrame({col1_name: word_list, col0_name: stemmed_list})
    # group the non-stemmed words under each shared stem
    stem_groups = (word_df.groupby(col0_name, as_index=False)[[col1_name]]
                   .aggregate(lambda x: list(x)))

    # get LDA model
    lda_folder = '45_lda_train_interval5'
    lda_path = get_sibling_directory_path(lda_folder)
    model_num = 'model102'
    lda_path = os.path.join(lda_path, model_num)
    lda_model_filename = 'lda_' + model_num
    lda_model_filepath = os.path.join(lda_path, lda_model_filename)
    lda = LdaModel.load(lda_model_filepath)

    # get most probable words for each topic in LDA model
    topic_words_n = 50
    topic_words, topic_probs = extract_topic_words(topic_words_n, lda)
    stem_groups_topic = stem_groups_by_topic(topic_words, stem_groups,
                                             topic_probs)

    # save information
    output_filepath = os.path.join(lda_path, 'stem_reference.csv')
    word_df.to_csv(output_filepath, index=False)
    output_filepath = os.path.join(lda_path, 'stem_reference_grouped.csv')
    stem_groups.to_csv(output_filepath, index=False)
    output_filepath = os.path.join(lda_path, 'stem_groups_by_topic.csv')
    stem_groups_topic.to_csv(output_filepath, index=False)


if __name__ == '__main__':
    main()