Model Training Log Extraction: Code

Here are the Anaconda ‘environment.yml’ specifications:

name: lda
dependencies:
- boto=2.47.0=py36_0
- bz2file=0.98=py36_0
- gensim=2.2.0=np113py36_0
- libgfortran=3.0.0=1
- mkl=2017.0.3=0
- nltk=3.2.4=py36_0
- numpy=1.13.1=py36_0
- openssl=1.0.2l=0
- pandas=0.20.2=np113py36_0
- pip=9.0.1=py36_1
- python=3.6.1=2
- python-dateutil=2.6.0=py36_0
- pytz=2017.2=py36_0
- readline=6.2=2
- requests=2.14.2=py36_0
- scipy=0.19.1=np113py36_0
- setuptools=27.2.0=py36_0
- six=1.10.0=py36_0
- smart_open=1.5.3=py36_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py36_0
- xz=5.2.2=1
- zlib=1.2.8=3
- pip:
- html2text==2016.9.19
- smart-open==1.5.3

Here is the code:

(Extraction artifact removed: a copied line-number gutter, 1–207, with no content. The code itself follows below.)
#! /usr/bin/env python3


def listdir_nohidden_nofolder(path):
    '''
    Returns a list of non-hidden files in the directory at 'path'
        (folders are excluded)
    'os.listdir' includes hidden files, but this function excludes them
    '''

    import os

    # 'os.walk' yields (dirpath, dirnames, filenames); index [2] keeps
    # regular files only, excluding subdirectories
    file_list = next(os.walk(path))[2]

    # Build a new list instead of calling 'remove' on 'file_list' while
    # iterating over it: remove-during-iteration skips the element that
    # immediately follows each removed one, so consecutive hidden files
    # (e.g. '.a', '.b') were not all filtered out
    return [e for e in file_list if not e.startswith('.')]


def read_text_file(text_filename, as_string=False):
    '''
    Reads each line in a text file as a list item and returns the list by
        default
    If 'as_string' is 'True', reads the entire text file as a single string
        with newlines removed
    If the file cannot be read, returns an error-message string (preserved
        from the original interface so existing callers keep working)
    '''

    try:
        # 'with' closes the file automatically; the original also called
        # 'text.close()' inside the 'with' block, which was redundant
        with open(text_filename) as text:
            if as_string:
                # reads text file as a single string, dropping newlines
                return text.read().replace('\n', '')
            # reads each line of the text file as an item in a list
            return [line.rstrip('\n') for line in text]

    except (OSError, UnicodeDecodeError):
        # narrowed from a bare 'except:', which also swallowed unrelated
        # errors (e.g. KeyboardInterrupt); the returned message is unchanged
        return 'There was an error while trying to read the file'


def write_list_to_text_file(a_list, text_file_name, overwrite_or_append='a'):
    '''
    Writes a list to a text file with one list element per line in the file
    Appends by default; change to overwriting by setting 'overwrite_or_append'
        to 'w' instead of 'a'
    '''

    # 'with' guarantees the file is closed; the original 'try/finally'
    # raised a NameError in 'finally' whenever 'open' itself failed,
    # because 'textfile' was never bound
    with open(text_file_name, overwrite_or_append, encoding='utf-8') as textfile:
        for element in a_list:
            # one element per line, formatted via str conversion as before
            textfile.write('{e}\n'.format(e=element))


def get_log_topic_diff(log):
    '''
    Extracts the topic difference calculations ('topic diff') from a Gensim
        LDA model training log and returns them as a list of floats
    'log' - a list where each element is a line from the Gensim LDA model
        training log
    '''
    marker = 'topic diff='

    values = []
    for line in log:
        if marker not in line:
            continue
        # keep the text after the last occurrence of the marker, truncate
        # at the first comma, and convert the remainder to a number
        after_marker = line.rsplit(marker, 1)[1]
        values.append(float(after_marker.split(',', 1)[0]))

    return values


def get_log_documents_converged(log):
    '''
    Identifies numbers of documents that converge in a Gensim LDA model
        training log and returns these numbers as proportions (e.g., 49 out
        of 100 documents converged is returned as 0.49)
    Note that most of the denominators (i.e., the size of the mini-batch in
        online training) are the same, but when the trainer reaches the end of
        a pass through a corpus, the last mini-batch size may be smaller
    'log' - a list where each element is a line from the Gensim LDA model
        training log
    '''

    marker = 'documents converged within'
    debug_tag = 'DEBUG :'
    slash = '/'

    proportions = []
    for line in log:
        if marker not in line:
            continue
        # text before the marker holds the 'numerator/denominator' fraction,
        # preceded by the logger's 'DEBUG :' prefix
        fraction_text = line.rsplit(marker, 1)[0]
        fraction_text = fraction_text.rsplit(debug_tag, 1)[1].strip()
        parts = fraction_text.split(slash)
        proportions.append(int(parts[0]) / int(parts[1]))

    return proportions


def divide_log_by_model(log):
    '''
    Splits a log of Gensim LDA models training into multiple logs -- one for
        each LDA model recorded in the original log
    Returns the per-model logs as a list of lists of lines
    'log' - a list where each element is a line from the Gensim LDA model
        training log
    '''
    marker = "'init_dir_prior', 'created':"
    marker_positions = [i for i, line in enumerate(log) if marker in line]

    # the marker occurs in the first 2 lines of every model run, so keep
    # only every other occurrence to mark each model's start exactly once
    starts = marker_positions[::2]

    # each model's slice runs up to the next model's start; the last one
    # runs to the end of the log
    ends = starts[1:] + [None]
    return [log[s:e] for s, e in zip(starts, ends)]


def get_end_digits_from_filename(filename):
    '''
    Returns any consecutive digits at the end of a filename (excluding the
        filename extension) as an integer
    If no digits are present at the end of the filename, returns an empty
        string
    Note the mixed return types (int or str '') -- preserved from the
        original interface so existing callers keep working
    '''
    from re import search as re_search

    # drop the extension (text after the last '.'), then match trailing digits
    end_digits = re_search(r'\d+$', filename.rsplit('.', 1)[0])

    # 'is None' instead of '== None': identity is the correct test for None
    if end_digits is None:
        return ''
    return int(end_digits.group())


def process_log(log_filename):
    '''
    Reads log of Gensim LDA models training from file, divides it into multiple
        logs if multiple LDA models are recorded in the log, extracts 'topic
        diff' and proportions of converged documents per mini-batch from the
        logs, and saves them to text files
    'log_filename' - filename for log of Gensim LDA models training; the model
        ID number for the first model in the log is assumed to be at the end of
        the filename (excluding the file extension); subsequent model ID
        numbers are assumed to increment by 1 (e.g., model 1, model 2,
        model 3, etc.)
    '''

    import os

    log_path = os.path.dirname(log_filename)
    log = read_text_file(log_filename)
    logs = divide_log_by_model(log)

    topic_diffs = [get_log_topic_diff(e) for e in logs]
    converged_proportions = [get_log_documents_converged(e) for e in logs]
    # NOTE(review): if the filename has no trailing digits this is '' and
    # 'model_id + i' below raises TypeError -- presumably filenames always
    # carry an ID; confirm against the caller
    model_id = get_end_digits_from_filename(log_filename)

    # iterate the extracted series in lockstep instead of indexing with
    # 'range(len(logs))'; output files live next to the input log
    for i, (td, cp) in enumerate(zip(topic_diffs, converged_proportions)):

        td_filename = 'topic_diffs' + str(model_id + i) + '.txt'
        write_list_to_text_file(td, os.path.join(log_path, td_filename), 'w')

        cp_filename = 'converged_props' + str(model_id + i) + '.txt'
        write_list_to_text_file(cp, os.path.join(log_path, cp_filename), 'w')


def process_logs(log_filenames):
    '''
    Calls function 'process_log' once for each filename in 'log_filenames'
    'log_filenames' - list of filenames
    '''
    # iterate directly over the list rather than indexing via 'range(len(...))'
    for log_filename in log_filenames:
        process_log(log_filename)


def main():
    '''
    From each file named in 'log_filenames', reads log of Gensim LDA models
        training from file, divides it into multiple logs if multiple LDA models
        are recorded in the log, extracts 'topic diff' and proportions of
        converged documents per mini-batch from the logs, and saves them to text
        files
    '''

    import os

    # logs are expected in a 'logs' subfolder of the current working directory
    logs_folder = 'logs'
    path = os.path.join(os.getcwd(), logs_folder)
    filenames = listdir_nohidden_nofolder(path)

    # keep only files whose names begin with the expected log-file stem;
    # 'startswith' replaces the original manual slice-and-compare idiom
    log_stem = 'log_lda_model'
    log_filenames = [e for e in filenames if e.startswith(log_stem)]
    log_filepaths = [os.path.join(path, e) for e in log_filenames]

    process_logs(log_filepaths)


# run only when executed as a script, not when imported as a module
if __name__ == '__main__':
    main()