Here are the Anaconda `environment.yml` specifications:
name: gensim_nltk_pandas
dependencies:
  - boto=2.47.0=py36_0
  - bz2file=0.98=py36_0
  - gensim=2.2.0=np113py36_0
  - libgfortran=3.0.0=1
  - mkl=2017.0.3=0
  - nltk=3.2.4=py36_0
  - numpy=1.13.1=py36_0
  - openssl=1.0.2l=0
  - pandas=0.20.2=np113py36_0
  - pip=9.0.1=py36_1
  - python=3.6.2=0
  - python-dateutil=2.6.0=py36_0
  - pytz=2017.2=py36_0
  - readline=6.2=2
  - requests=2.14.2=py36_0
  - scipy=0.19.1=np113py36_0
  - setuptools=27.2.0=py36_0
  - six=1.10.0=py36_0
  - smart_open=1.5.3=py36_0
  - sqlite=3.13.0=0
  - tk=8.5.18=0
  - wheel=0.29.0=py36_0
  - xz=5.2.2=1
  - zlib=1.2.8=3
  - pip:
    - smart-open==1.5.3
Here is the code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
#! /usr/bin/env python3


def get_sibling_directory_path(sibling_directory_name):
    '''
    Returns the path of the folder named 'sibling_directory_name' that is
        in the same parent directory as the current working directory
    '''

    import os

    # 'os.path.dirname' replaces the manual 'rfind(os.sep)' slicing of the
    # original, with identical results for a normal working-directory path
    parent_directory_path = os.path.dirname(os.getcwd())
    sibling_directory_path = os.path.join(parent_directory_path,
                                          sibling_directory_name)

    return sibling_directory_path


def get_dictionary_list():
    '''
    Retrieves the list of all English dictionary words from the NLTK
        'words' corpus (requires the corpus to be downloaded)
    '''

    from nltk.corpus import words

    return words.words()


def stem_words(word_list):
    '''
    Stems each word in 'word_list' with the NLTK Lancaster stemmer and
        returns the resulting list (same length and order as 'word_list')
    '''

    from nltk.stem.lancaster import LancasterStemmer

    stemmer = LancasterStemmer()

    return [stemmer.stem(w) for w in word_list]


def extract_topic_words(topic_words_n, lda_model):
    '''
    Extracts the most probable words of each topic from 'lda_model' and
        their corresponding probabilities
    'topic_words_n' - number of words to extract for each topic
    'lda_model' - trained Gensim LdaModel object
    Returns a tuple '(words, numbers)' where 'words' is a list (one element
        per topic) of lists of words and 'numbers' is the parallel list of
        lists of their probabilities
    '''

    # 'print_topics' returns one string per topic of the form
    # '0.030*"cat" + 0.020*"dog" + ...'; words and probabilities are
    # parsed out of that string below
    topic_words = lda_model.print_topics(num_words=topic_words_n,
                                         num_topics=50)
    words_info = [e[1] for e in topic_words]

    words = []
    numbers = []
    for info in words_info:

        # each word is enclosed in double quotes: pair up successive
        # quote positions to slice the words out
        quotes_idx = [p for p, q in enumerate(info) if q == '\"']
        q_start = quotes_idx[::2]
        q_end = quotes_idx[1::2]
        ws = [info[q_start[j] + 1:q_end[j]] for j in range(len(q_start))]
        words.append(ws)

        # each probability immediately precedes an asterisk; terms are
        # joined by ' + ', so a probability starts 2 characters after a '+'
        plus_idx = [p for p, q in enumerate(info) if q == '+']
        asterisk_idx = [p for p, q in enumerate(info) if q == '*']
        ns = [float(info[0:asterisk_idx[0]])]
        ns2 = [float(info[plus_idx[j] + 2:asterisk_idx[j + 1]])
               for j in range(len(plus_idx))]
        ns.extend(ns2)
        numbers.append(ns)

    return words, numbers


def stem_groups_by_topic(topic_words, stem_groups, topic_probs):
    '''
    Builds a table matching each topic's stemmed words (and their
        probabilities) to the group of non-stemmed words from which each
        stem may have been derived
    'topic_words' - list (one element per topic) of lists of stemmed words
    'stem_groups' - DataFrame whose first column holds stems and whose
        second column holds the corresponding groups of non-stemmed words
    'topic_probs' - list of lists of probabilities parallel to 'topic_words'
    Returns a DataFrame with columns 'topic', 'probs', 'stem',
        'non_stemmed'; stems not found in 'stem_groups' get an empty string
        for 'non_stemmed'
    '''

    import pandas as pd

    rows = []
    # '.str.match()' does not return only exact matches, so a plain list
    # membership test / 'index' lookup is used instead
    stem_list = stem_groups.iloc[:, 0].tolist()
    for i in range(len(topic_words)):
        for j in range(len(topic_words[i])):
            if topic_words[i][j] in stem_list:
                idx = stem_list.index(topic_words[i][j])
                # '.iloc' replaces the deprecated (and since-removed) '.ix'
                # indexer; 'idx' is positional, so behavior is identical
                non_stems = stem_groups.iloc[idx, 1]
            else:
                non_stems = ''
            rows.append([i, topic_probs[i][j], topic_words[i][j], non_stems])

    col_names = ['topic', 'probs', 'stem', 'non_stemmed']
    stem_groups_topic = pd.DataFrame.from_records(rows, columns=col_names)

    return stem_groups_topic


def write_list_to_text_file(a_list, text_file_name, overwrite_or_append='a'):
    '''
    writes a list of strings to a text file, one element per line
    appends by default; change to overwriting by setting
        'overwrite_or_append' to 'w' instead of 'a'
    '''

    # 'with' guarantees the file is closed; the original 'try/finally'
    # raised a NameError in 'finally' whenever 'open' itself failed
    with open(text_file_name, overwrite_or_append,
              encoding='utf-8') as textfile:
        for element in a_list:
            textfile.write(element)
            textfile.write('\n')


def main():
    '''
    Gets most probable (stemmed) words for each topic in a Gensim LDA model
        and matches them to non-stemmed words from which they might have
        been derived
    '''

    import os
    import pandas as pd
    from gensim.models.ldamodel import LdaModel

    # get English dictionary words and their stemmed versions
    word_list = get_dictionary_list()
    stemmed_list = stem_words(word_list)
    col0_name = 'stemmed'
    col1_name = 'words'
    word_df = pd.DataFrame({col1_name: word_list, col0_name: stemmed_list})
    # group the non-stemmed words under each shared stem
    stem_groups = (word_df.groupby(col0_name, as_index=False)[[col1_name]]
                   .aggregate(lambda x: list(x)))

    # get LDA model
    lda_folder = '45_lda_train_interval5'
    lda_path = get_sibling_directory_path(lda_folder)
    model_num = 'model102'
    lda_path = os.path.join(lda_path, model_num)
    lda_model_filename = 'lda_' + model_num
    lda_model_filepath = os.path.join(lda_path, lda_model_filename)
    lda = LdaModel.load(lda_model_filepath)

    # get most probable words for each topic in LDA model
    topic_words_n = 50
    topic_words, topic_probs = extract_topic_words(topic_words_n, lda)
    stem_groups_topic = stem_groups_by_topic(topic_words, stem_groups,
                                             topic_probs)

    # save information
    output_filepath = os.path.join(lda_path, 'stem_reference.csv')
    word_df.to_csv(output_filepath, index=False)
    output_filepath = os.path.join(lda_path, 'stem_reference_grouped.csv')
    stem_groups.to_csv(output_filepath, index=False)
    output_filepath = os.path.join(lda_path, 'stem_groups_by_topic.csv')
    stem_groups_topic.to_csv(output_filepath, index=False)


if __name__ == '__main__':
    main()