Here are the Anaconda ‘environment.yml’ specifications:
name: lda
dependencies:
  - boto=2.47.0=py36_0
  - bz2file=0.98=py36_0
  - gensim=2.2.0=np113py36_0
  - libgfortran=3.0.0=1
  - mkl=2017.0.3=0
  - nltk=3.2.4=py36_0
  - numpy=1.13.1=py36_0
  - openssl=1.0.2l=0
  - pandas=0.20.2=np113py36_0
  - pip=9.0.1=py36_1
  - python=3.6.1=2
  - python-dateutil=2.6.0=py36_0
  - pytz=2017.2=py36_0
  - readline=6.2=2
  - requests=2.14.2=py36_0
  - scipy=0.19.1=np113py36_0
  - setuptools=27.2.0=py36_0
  - six=1.10.0=py36_0
  - smart_open=1.5.3=py36_0
  - sqlite=3.13.0=0
  - tk=8.5.18=0
  - wheel=0.29.0=py36_0
  - xz=5.2.2=1
  - zlib=1.2.8=3
  - pip:
    - html2text==2016.9.19
    - smart-open==1.5.3
Here is the code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
#! /usr/bin/env python3
import os
import re


def listdir_nohidden_nofolder(path):
    '''
    Returns a list of the non-hidden files directly inside the directory 'path'

    Folders are excluded, and so are hidden files (names starting with '.'),
    which 'os.listdir' would include

    'path' - directory to list; if 'os.walk' yields nothing for it (e.g., the
        directory does not exist), 'next' raises StopIteration
    '''
    # the first tuple yielded by 'os.walk' describes 'path' itself; item [2]
    #   holds the names of its files (not its sub-folders)
    file_names = next(os.walk(path))[2]
    # build a new list instead of removing items while iterating: removal
    #   during iteration skips the element right after each removed one, so
    #   consecutive hidden files used to slip through
    return [e for e in file_names if not e.startswith('.')]


def read_text_file(text_filename, as_string=False):
    '''
    Reads each line in a text file as a list item and returns the list by
        default (trailing newline characters are stripped from each line)
    If 'as_string' is 'True', reads the entire text file as a single string
        with all newline characters removed

    If anything goes wrong while opening or reading the file, no exception is
        raised; the string 'There was an error while trying to read the file'
        is returned instead, so callers that need to detect failure must
        check the returned value
    '''
    try:
        with open(text_filename) as text:
            if as_string:
                # reads text file as single string
                return text.read().replace('\n', '')
            # reads each line of text file as item in a list
            return [line.rstrip('\n') for line in text]
    except Exception:
        # 'Exception' instead of a bare 'except' so that KeyboardInterrupt
        #   and SystemExit are no longer swallowed
        return 'There was an error while trying to read the file'


def write_list_to_text_file(a_list, text_file_name, overwrite_or_append='a'):
    '''
    Writes a list to a UTF-8 text file with one list element per line

    'a_list' - elements to write; each one is converted with 'str.format'
    'text_file_name' - path of the file to write
    'overwrite_or_append' - file mode; appends by default ('a'); change to
        overwriting by setting to 'w'

    Raises OSError if the file cannot be opened or written
    '''
    # 'with' closes the file on every path and, unlike a 'finally' that calls
    #   'close' on a name that was never bound, lets a failed 'open' surface
    #   as the real OSError
    with open(text_file_name, overwrite_or_append, encoding='utf-8') as textfile:
        for element in a_list:
            textfile.write('{e}'.format(e=element))
            textfile.write('\n')


def get_log_topic_diff(log):
    '''
    Identifies and returns topic difference calculations ('topic diff') in
        Gensim LDA model training log, as a list of floats

    'log' - a list where each element is a line from the Gensim LDA model
        training log
    '''
    identifier = 'topic diff='
    topic_diffs = []
    for line in log:
        if identifier not in line:
            continue
        # text after the last 'topic diff=' on the line ...
        after_identifier = line.rsplit(identifier, 1)[1]
        # ... up to the first comma is the number itself
        topic_diffs.append(float(after_identifier.split(',', 1)[0]))
    return topic_diffs


def get_log_documents_converged(log):
    '''
    Identifies numbers of documents that converge in Gensim LDA model training
        log and returns these numbers as proportions (e.g., 49 out of 100
        documents converged is returned as 0.49)
    Note that most of the denominators (i.e., the size of the mini-batch in
        online training) are the same, but when the trainer reaches the end of
        a pass through a corpus, the last mini-batch size may be smaller

    'log' - a list where each element is a line from the Gensim LDA model
        training log; matching lines are expected to look like
        '... DEBUG : 49/100 documents converged within ...'
    '''
    identifier = 'documents converged within'
    debug_splitter = 'DEBUG :'
    slash = '/'

    proportions = []
    for line in log:
        if identifier not in line:
            continue
        # keep what precedes the identifier, then what follows the last
        #   'DEBUG :' marker, leaving only the '<converged>/<batch size>' text
        before_identifier = line.rsplit(identifier, 1)[0]
        fraction = before_identifier.rsplit(debug_splitter, 1)[1].strip()
        parts = fraction.split(slash)
        proportions.append(int(parts[0]) / int(parts[1]))
    return proportions


def divide_log_by_model(log):
    '''
    Divides log of Gensim LDA models training into multiple logs -- 1 for each
        LDA model in the original log
    Returns logs as a list of the logs; lines that precede the first model's
        starting line are not included in any of them

    'log' - a list where each element is a line from the Gensim LDA model
        training log
    '''
    identifier = "'init_dir_prior', 'created':"
    identifier_idx = [i for i, e in enumerate(log) if identifier in e]
    # identifier occurs in 1st 2 lines of every model run, so keep only every
    #   other occurrence (the 1st line of each run)
    start_idx = identifier_idx[::2]
    # each model's log runs up to the next model's start; the last one runs to
    #   the end of the log ('None' as slice end)
    end_idx = start_idx[1:] + [None]
    return [log[s:e] for s, e in zip(start_idx, end_idx)]


def get_end_digits_from_filename(filename):
    '''
    Returns, as an integer, any consecutive digits at the end of a filename,
        excluding the filename extension (the text after the last '.')
    If no digits are present at the end of the filename, returns empty string
    '''
    stem = filename.rsplit('.', 1)[0]
    end_digits = re.search(r'\d+$', stem)
    if end_digits is None:
        return ''
    return int(end_digits.group())


def process_log(log_filename):
    '''
    Reads log of Gensim LDA models training from file, divides it into
        multiple logs if multiple LDA models are recorded in the log, extracts
        'topic diff' and proportions of converged documents per mini-batch from
        the logs, and saves them to text files

    For each model, the files 'topic_diffs<ID>.txt' and
        'converged_props<ID>.txt' are written (overwriting any existing files)
        to the directory containing 'log_filename'
    If the log file cannot be read, nothing is written

    'log_filename' - filename for log of Gensim LDA models training; the model
        ID number for the first model in the log is assumed to be at the end
        of the filename (excluding the file extension); subsequent model ID
        numbers are assumed to increment by 1 (e.g., model 1, model 2,
        model 3, etc.)

    Raises ValueError if the log contains at least one model but the filename
        does not end in a model ID number
    '''
    log_path = os.path.dirname(log_filename)

    log = read_text_file(log_filename)
    if isinstance(log, str):
        # 'read_text_file' signals failure by returning a message string
        #   instead of a list of lines; there is nothing to process
        return

    logs = divide_log_by_model(log)
    topic_diffs = [get_log_topic_diff(e) for e in logs]
    converged_proportions = [get_log_documents_converged(e) for e in logs]

    model_id = get_end_digits_from_filename(log_filename)
    if logs and model_id == '':
        # without this check, adding an integer to the empty string below
        #   fails with an uninformative TypeError
        raise ValueError(
            'No model ID number found at the end of log filename: '
            '{f}'.format(f=log_filename))

    per_model = zip(topic_diffs, converged_proportions)
    for offset, (diffs, proportions) in enumerate(per_model):
        suffix = str(model_id + offset) + '.txt'

        td_filepath = os.path.join(log_path, 'topic_diffs' + suffix)
        write_list_to_text_file(diffs, td_filepath, 'w')

        cp_filepath = os.path.join(log_path, 'converged_props' + suffix)
        write_list_to_text_file(proportions, cp_filepath, 'w')


def process_logs(log_filenames):
    '''
    Calls function 'process_log' once for each filename in 'log_filenames'

    'log_filenames' - list of filenames
    '''
    for log_filename in log_filenames:
        process_log(log_filename)


def main():
    '''
    From each file in the 'logs' folder of the current working directory whose
        name starts with 'log_lda_model', reads log of Gensim LDA models
        training from file, divides it into multiple logs if multiple LDA
        models are recorded in the log, extracts 'topic diff' and proportions
        of converged documents per mini-batch from the logs, and saves them to
        text files
    '''
    logs_folder = 'logs'
    path = os.path.join(os.getcwd(), logs_folder)
    filenames = listdir_nohidden_nofolder(path)

    log_stem = 'log_lda_model'
    log_filenames = [e for e in filenames if e.startswith(log_stem)]
    log_filepaths = [os.path.join(path, e) for e in log_filenames]

    process_logs(log_filepaths)


if __name__ == '__main__':
    main()