Calculate Prominences

Here are the Anaconda ‘environment.yml’ specifications:

name: spell_check
dependencies:
- mkl=2017.0.1=0
- numpy=1.13.0=py36_0
- openssl=1.0.2l=0
- pandas=0.20.2=np113py36_0
- pip=9.0.1=py36_1
- python=3.6.1=2
- python-dateutil=2.6.0=py36_0
- pytz=2017.2=py36_0
- readline=6.2=2
- setuptools=27.2.0=py36_0
- six=1.10.0=py36_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py36_0
- xz=5.2.2=1
- zlib=1.2.8=3
- pip:
- pyenchant==1.6.8

Here is the code:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
#! /usr/bin/env python3


def get_sibling_directory_path(sibling_directory_name):
    '''
    returns path for a specified folder that is in the same parent directory as
        the current working directory
    '''

    import os

    current_path = os.getcwd()
    last_separator_position = current_path.rfind(os.sep)
    parent_directory_path = current_path[0:last_separator_position]
    sibling_directory_path = os.path.join(parent_directory_path,
                                        sibling_directory_name)
    return(sibling_directory_path)


def print_intermittent_status_message_in_loop(iteration, every_xth_iteration,
                                            total_iterations):
    '''
    Prints message updating loop's progress for user
    '''

    if iteration % every_xth_iteration == 0:
        print('Processing file {0} of {1}, which is {2:.0f}%'
            .format(iteration + 1, total_iterations,
                    100 * (iteration + 1) / total_iterations))


def read_table(table_filepath, column_of_lists):
    '''
    reads table from 'csv' file
    each item in column 'column_of_lists' is read as a list; as currently
        written, the function can read only 1 column as a list
    '''

    import pandas as pd
    from ast import literal_eval

    # '^' used as separator because it does not appear in any text descriptions
    table = pd.read_csv(table_filepath, sep='^',
                        converters={column_of_lists[0]: literal_eval,
                                    column_of_lists[1]: literal_eval,
                                    column_of_lists[2]: literal_eval})
    return(table)


def read_text_file(text_filename, as_string=False):
    '''
    reads each line in a text file as a list item and returns list by default
    if 'as_string' is 'True', reads entire text file as a single string
    '''

    text_list = []

    try:
        with open(text_filename) as text:
            if as_string:
                # reads text file as single string
                text_list = text.read().replace('\n', '')
            else:
                # reads each line of text file as item in a list
                for line in text:
                    text_list.append(line.rstrip('\n'))
            text.close()
        return(text_list)

    except:
        return('There was an error while trying to read the file')


def write_list_to_text_file(a_list, text_file_name, overwrite_or_append='a'):
    '''
    writes a list of strings to a text file
    appends by default; change to overwriting by setting to 'w' instead of 'a'
    '''

    try:
        textfile = open(text_file_name, overwrite_or_append, encoding='utf-8')
        for element in a_list:
            textfile.write(element)
            textfile.write('\n')

    finally:
        textfile.close()


def add_possessives_to_word_list(word_list):
    '''
    Adds an apostrophe and 's' to each element in 'word_list', then adds each of
        these possessives to the original 'word_list' immediately after the
        corresponding word
    '''

    possessives = [s + '\'s' for s in word_list]
    words_with_poss = [None] * (len(word_list) * 2)
    words_with_poss[::2] = word_list
    words_with_poss[1::2] = possessives

    return(words_with_poss)


def tally_word_counts_in_column(table_column, word_list, output_len):
    '''
    Inputs:  'table_column' is a Pandas DataSeries where each element is a
        string (or list of strings) to be searched; 'word_list' is a list of
        words to search for; 'output_len' is the length of the output table
        'tallies'
    Outputs:  a table/DataFrame with each element in 'word_list' as a column
        name, each row corresponding to each element in 'table_column', and
        values of '0' or '1' indicating whether that column's 'word_list'
        element appeared in that row's 'table_column' element; '0' is 'No' and
        '1' is 'Yes'
    'word_list' elements that are single words must match a token from the
        'table_column' string to be considered a match; this prevents words from
        matching when they are only sub-words in the string (e.g., so that
        searching for 'hat' in 'that' does not yield a match);
    'word_list' elements that are multiple words or hyphenated are searched for
        in the string itself (which can produce the sub-word problem described
        above)
    '''

    import pandas as pd
    from numpy import arange
    from enchant.tokenize import get_tokenizer

    tallies = pd.DataFrame(0, index=arange(output_len), columns=word_list)
    tokenizer = get_tokenizer('en_US')
    message_interval = 1000

    for i, text in table_column.iteritems():

        print_intermittent_status_message_in_loop(i, message_interval,
                                                output_len)

        if isinstance(text, list):  # if text stored in list instead of string
            if not text[0] and len(text) == 1:
                continue            # skips empty lists
            text = ' '.join(text)

        tokens = [w[0].lower() for w in tokenizer(text)]

        for j in range(len(word_list)):

            if (' ' in word_list[j]) or ('-' in word_list[j]):
                if word_list[j] in text:
                    tallies.iloc[i, j] = 1
            else:
                if word_list[j] in tokens:
                    tallies.iloc[i, j] = 1

    return(tallies)


def combine_column_pairs(table):
    '''
    Combines each adjacent pair of columns in a dataframe with Boolean 'or' and
        returns the new table (with half the number of columns as the original
        table)
    '''

    import numpy as np
    import pandas as pd

    columns = table.columns.values.tolist()
    columns = columns[0::2]     # excludes every other element
    combined_table = pd.DataFrame(0, index=np.arange(len(table)),
                                columns=columns)

    for i in range(len(columns)):
        combined_table.iloc[:, i] = table.iloc[:, i*2] | table.iloc[:, i*2+1]

    return(combined_table)


def count_characters(expanded_table, characters):
    '''
    Performs a series of counts; each count tallies when words from 'characters'
        appear in a specified column from 'expanded_table'
    Each count is returned as a table/Pandas DataFrame; each word in
        'characters' has its own column; each element from the column in
        'expanded_table' has its own row; if a word appears in a column element,
        the value is '1', otherwise it's '0'
    Output:  Each count table is stored as an element in the list 'counts'
    '''

    columns_to_count = [expanded_table.ix[:, 'text_spell_corrected'],         # for an overall count
                        expanded_table.ix[:, 'text_nontalk'],                 # count in text outside double-quotes
                        expanded_table.ix[:, 'text_talk'],                    # count in text inside double-quotes
                        expanded_table.ix[expanded_table['odd_quotes'] == 1,  # counts text with odd number of double-quotes
                                        'text_spell_corrected'],
                        expanded_table.ix[expanded_table['no_quotes'] == 1,   # counts text with no double-quotes
                                        'text_spell_corrected']]
    counts = []

    for i in range(len(columns_to_count)):
        temp = tally_word_counts_in_column(columns_to_count[i], characters,
                                        len(expanded_table))
        counts.append(combine_column_pairs(temp))

    return(counts)


def counts_summary_table(counts):
    '''
    Returns table of counts for each searched-for word (usually characters),
        with one word per row; columns show different counts based on which
        text was counted
    '''

    import pandas as pd

    a = counts[0].sum()     # overall count, from 'text_spell_corrected' column
    b = counts[1].sum()     # 'nontalk' count from 'text_nontalk' column
    c = counts[2].sum()     # 'talk' count from 'text_talk' column
    d = counts[3].sum()     # 'odd_quotes' count from text panels with an odd number of double-quotes
    e = counts[4].sum()     # 'no_quotes' count from text panels with no double-quotes

    f = (counts[1] | counts[2] | counts[3] | counts[4]).sum()   # sums everything except 'overall'; should be identical to 'overall'

    sums = pd.DataFrame(data=[a, b + e, c, d, f]).transpose()   # b + e:  'no_quotes' is classified as 'nontalk'
    sums.columns = ['overall', 'nontalk', 'talk', 'oddq', 'sum']
    sums['error'] = ((sums['sum'] / sums['overall']) * 100) - 100  # verify that 'overall' matches 'sum', errors = 0

    return(sums)


def characters_merge_dict():
    '''
    Creates dictionary of keys naming characters and values of the names that
        the keys should be reclassified under; this allows the data associated
        with the keys to be associated with the value
    '''

    chars = {'charlie'                      : 'charlie brown',
            'brown'                        : 'charlie brown',
            'charles'                      : 'charlie brown',
            'lucy van pelt'                : 'lucy',
            'linus van pelt'               : 'linus',
            'sally brown'                  : 'sally',
            'peggy-jean'                   : 'peggy jean',
            'pig pen'                      : 'pig-pen',
            'pigpen'                       : 'pig-pen',
            'beagle scout'                 : 'beaglescout',
            'beagle-scout'                 : 'beaglescout',
            'beagle scouts'                : 'beaglescouts',
            'beagle-scouts'                : 'beaglescouts',
            'shlabotnik'                   : 'joe shlabotnik',
            'molly'                        : 'molly volley',
            'charlotte'                    : 'charlotte braun',
            'braun'                        : 'charlotte braun',
            'crybaby'                      : 'crybaby boobie',
            'boobie'                       : 'crybaby boobie',
            'tapioca'                      : 'tapioca pudding',
            'red haired girl'              : 'red-haired girl',
            'sweetstory'                   : 'helen sweetstory',
            'pig-tailed girl'              : 'pigtailed girl',
            'pig tailed girl'              : 'pigtailed girl',
            'kindergarten friend'          : 'pigtailed girl',
            'school building'              : 'the school building',
            'school thinks'                : 'the school building',
            'world-famous'                 : 'world famous',
            'world-famous grocery clerk'   : 'world famous grocery clerk',
            'world-famous golf'            : 'world famous golf',
            'legionnaires'                 : 'legionnaire',
            'kite-eating'                  : 'kite-eating tree',
            'kite eating tree'             : 'kite-eating tree'}

    return(chars)


def merge_two_columns(count_table, col1_name, col2_name, merged_name):
    '''
    Replaces two columns in a Pandas DataFrame with new column created by a
        Boolean disjunction ('or') of the original two columns
    '''

    col1 = count_table[col1_name]
    col2 = count_table[col2_name]

    count_table.drop(col1_name, axis=1, inplace=True)
    count_table.drop(col2_name, axis=1, inplace=True)
    count_table[merged_name] = col1 | col2

    return(count_table)


def find_two_columns_to_merge(column_names, merge_dict):
    '''
    searches in 'column_names' for the names of two columns to be merged
    a column should be merged if it's a key in 'merge_dict'; the key's value
        is the new name for the merged column
    if the new name is itself already a column, the column should be merged
        with this existing 'new name' column
    if the new name is not already a column, then search for a 2nd column that
        has the same new name listed as a value in 'merge_dict' as the 1st
        column did; if such a 2nd column is found, merge the 1st and 2nd columns
        under the shared new name
    returns the names of the two columns to be merged and the new name for the
        merged column
    '''

    for i in range(len(column_names)):

        col1_name = column_names[i]

        if col1_name not in merge_dict:
            continue

        # find a column in 'merge_dict' to be merged; identify its new, merged name
        col1_new_name = merge_dict[col1_name]

        # if the new name is already a column
        if col1_new_name in column_names:

            # merge the two columns together under the new name
            return(col1_name, col1_new_name, col1_new_name)

        # if the new name is not already a column
        else:

            # find a 2nd column...
            for j in range(i + 1, len(column_names)):

                col2_name = column_names[j]

                # ...that should be merged...
                if col2_name in merge_dict:

                    col2_new_name = merge_dict[col2_name]

                    # ...and has same new name as the 1st column
                    if col1_new_name == col2_new_name:

                        # merge the two columns together under the shared new name
                        return(col1_name, col2_name, col1_new_name)

                    else:
                        continue

    # if no columns are found, return empty strings
    return('', '', '')


def merge_character_counts_into_single_columns(count_table):
    '''
    Inputs:  'count_table' is a table/Pandas DataFrame that tallies whether a
        word or phrase (naming characters) appeared in items of text; each word
        has its own column; each row corresponds to an item of text; if a word
        appeared in a text item, the corresponding cell in the 'count_table'
        is '1', otherwise it's '0'
    Merges columns in 'count_table' as specified in the dictionary
        'characters_merge', which names the columns to be merged (keys) and the
        names of the new, merged columns (values)
    The original columns that are merged are deleted from the 'count_table',
        while the new, merged columns are added to the table
    This allows characters with alternate spellings or names (e.g., 'Pig-Pen'
        and 'Pig Pen' to be reclassified under a single column
    Output:  'count_table' with the new, merged columns
    '''

    characters_merge = characters_merge_dict()
    i = 0       # ensures termination of loop in case of unanticipated error

    while i < count_table.shape[1]:

        col1_name, col2_name, new_name = find_two_columns_to_merge(
            count_table.columns, characters_merge)

        if not col1_name:
            break

        count_table = merge_two_columns(count_table, col1_name, col2_name,
                                        new_name)
        i += 1

    return(count_table)


def merge_characters_multiple_tables(counts):
    '''
    Loops function 'merge_character_counts_into_single_columns' for each
        count table in 'counts'
    Description of 'merge_character_counts_into_single_columns':
        Inputs:  'count_table' is a table/Pandas DataFrame that tallies whether a
            word or phrase (naming characters) appeared in items of text; each word
            has its own column; each row corresponds to an item of text; if a word
            appeared in a text item, the corresponding cell in the 'count_table'
            is '1', otherwise it's '0'
        Merges columns in 'count_table' as specified in the dictionary
            'characters_merge', which names the columns to be merged (keys) and the
            names of the new, merged columns (values)
        The original columns that are merged are deleted from the 'count_table',
            while the new, merged columns are added to the table
        This allows characters with alternate spellings or names (e.g., 'Pig-Pen'
            and 'Pig Pen' to be reclassified under a single column
    '''

    for i in range(len(counts)):
        counts[i] = merge_character_counts_into_single_columns(counts[i])

    return(counts)


def adjust_patty_counts_in_table(dates_column, count_table, pep_patty_dates):
    '''
    Adjusts counts for Patty and Peppermint Patty
    For 5 dates, Peppermint Patty is referred to only as 'Patty', so 'patty'
        counts are reassigned to 'peppermint patty' for these dates
    Peppermint Patty and Patty never appear in a comic together, so all 'patty'
        counts on Peppermint Patty appearance dates are set to '0', indicating
        that she didn't appear
    '''

    # 5 dates at end are when Peppermint Patty is referred to only as 'Patty'
    add_pep_dates = pep_patty_dates[-5:]

    # for these dates, assign 'patty' counts to 'peppermint patty'
    idx = dates_column[dates_column.isin(add_pep_dates)].index.values
    count_table.ix[idx, 'peppermint patty'] = count_table.ix[idx, 'patty']

    # for dates when Peppermint Patty appears, remove any appearances for 'Patty'
    # (they never appear in a comic together)
    idx = dates_column[dates_column.isin(pep_patty_dates)].index.values
    count_table.ix[idx, 'patty'] = 0

    return(count_table)


def adjust_patty_counts_multiple_tables(dates_column, counts, pep_patty_dates):
    '''
    Loops function 'adjust_patty_counts_in_table' for each
        count table in 'counts'
    Description of 'adjust_patty_counts_in_table':
        Adjusts counts for Patty and Peppermint Patty
        For 5 dates, Peppermint Patty is referred to only as 'Patty', so 'patty'
            counts are reassigned to 'peppermint patty' for these dates
        Peppermint Patty and Patty never appear in a comic together, so all 'patty'
            counts on Peppermint Patty appearance dates are set to '0', indicating
            that she didn't appear
    '''

    for i in range(len(counts)):
        counts[i] = adjust_patty_counts_in_table(dates_column, counts[i],
                                                pep_patty_dates)

    return(counts)


def create_appearances_dict():
    '''
    Creates dictionary of first and last appearances (or mentions, sometimes)
        for several characters in the Peanuts comic strip
    '''

    dates = {'sally'                : ['1959-08-23', '2000-02-06'],
            'rerun'                : ['1973-03-26', '2000-01-30'],
            'woodstock'            : ['1966-03-04', '2000-01-16'],
            #'peppermint'          : ['1966-08-22', '2000-02-13'],
            'violet'               : ['1951-02-07', '1997-11-27'],
            'marcie'               : ['1971-07-20', '2000-01-02'],
            'spike'                : ['1975-08-13', '1999-12-21'],
            'franklin'             : ['1968-07-31', '1995-11-05'],
            'andy'                 : ['1994-02-14', '1999-09-27'],
            'olaf'                 : ['1989-01-16', '1999-09-27'],
            'frieda'               : ['1961-03-06', '1985-11-22'],
            'eudora'               : ['1978-06-13', '1987-06-13'],
            'truffles'             : ['1975-03-31', '1977-01-29'],
            'peggy jean'           : ['1990-07-23', '1999-07-11'],
            'pig pen'              : ['1954-07-13', '1999-09-08'],
            'othmar'               : ['1959-10-06', '1973-09-11'],
            'roy'                  : ['1965-06-11', '1984-05-27'],
            'cormac'               : ['1992-07-17', '2000-02-06'],     # don't know last appearance
            'thibault'             : ['1970-06-04', '1973-08-04'],
            'sophie'               : ['1968-06-18', '1987-08-19'],
            'poochie'              : ['1972-12-17', '1973-01-07'],
            'naomi'                : ['1998-10-01', '2000-02-06'],     # don't know last appearance
            'maynard'              : ['1986-07-21', '1986-07-30'],
            'lydia'                : ['1986-06-09', '1999-03-23'],
            'lila'                 : ['1968-02-17', '1968-08-31'],
            'larry'                : ['1991-05-28', '1991-12-19'],
            'royanne'              : ['1993-04-01', '1994-03-12'],
            'harold'               : ['1983-12-16', '1984-05-20'],
            'benny'                : ['1982-04-15', '1982-05-01'],
            'molly volley'         : ['1977-05-09', '1991-01-01'],     # last appearance in 1990
            'charlotte braun'      : ['1954-11-30', '1955-02-01'],
            'crybaby boobie'       : ['1978-07-03', '1997-03-10'],
            'tapioca pudding'      : ['1986-09-04', '1986-12-01'],
            'helen sweetstory'     : ['1971-04-09', '1972-11-30'],     # last mention Nov 1972
            'the school building'  : ['1974-08-31', '1979-12-31'],     # dies and rebuilt in 1976; unsure of last appearance
            'ethan'                : ['1993-07-14', '1993-07-15'],
            'floyd'                : ['1976-07-20', '1976-08-06'],
            'shirley'              : ['1968-06-18', '1987-07-20'],
            'emily'                : ['1995-02-11', '1999-12-31'],     # last appearance in 1999
            'belle'                : ['1976-06-22', '1981-05-11'],
            'faron'                : ['1961-05-23', '1961-11-20'],
            'harriet'              : ['1980-05-12', '2000-02-06'],     # don't know last appearance
            'bill'                 : ['1978-03-27', '2000-02-06'],     # don't know last appearance
            'conrad'               : ['1978-03-27', '2000-02-06'],     # don't know last appearance
            'olivier'              : ['1978-03-27', '2000-02-06'],     # don't know last appearance
            'raymond'              : ['1988-10-13', '2000-02-06'],     # don't know last appearance
            'fred'                 : ['1990-04-03', '1993-05-13'],
            'wilson'               : ['1984-12-02', '1998-06-06']}

    return(dates)


def adjust_counts_by_appearance_dates_in_table(dates_column, count_table):
    '''
    Adjusts counts so that characters can't appear before their debut dates or
        after their final dates in the comic strip
    Characters might erroneously appear outside their appearance dates because
        of typoes or non-specific uses of their names (e.g., 'sally', 'rerun',
        'violet', and 'spike' have meanings beyond the characters' names
    '''

    dates = create_appearances_dict()
    column_names = count_table.columns

    for i in range(len(column_names)):

        if column_names[i] in dates:
            start_end_dates = dates[column_names[i]]
            start_idx = min(dates_column[dates_column ==
                                        start_end_dates[0]].index.values)
            end_idx = max(dates_column[dates_column ==
                                    start_end_dates[1]].index.values)
            count_table.iloc[:start_idx, i] = 0
            count_table.iloc[end_idx+1:, i] = 0

    return(count_table)


def adjust_counts_by_appearance_dates_multiple_tables(dates_column, counts):
    '''
    Loops function 'adjust_counts_in_table_by_appearance_dates' for each
        count table in 'counts'
    Description of 'adjust_counts_in_table_by_appearance_dates':
        Adjusts counts so that characters can't appear before their debut dates or
            after their final dates in the comic strip
        Characters might erroneously appear outside their appearance dates because
            of typoes or non-specific uses of their names (e.g., 'sally', 'rerun',
            'violet', and 'spike' have meanings beyond the characters' names
    '''

    for i in range(len(counts)):
        counts[i] = adjust_counts_by_appearance_dates_in_table(dates_column,
                                                            counts[i])

    return(counts)


def correct_misidentified_characters_in_table(dates_column, count_table,
                                            misidentifications):
    '''
    Switches characters' counts on dates when a character was misidentified
    'dates_column' - Pandas Data Series with dates/filenames
    'count_table' - Pandas DataFrame/table that tallies whether a word or phrase
        (naming characters) appeared in items of text; each word has its own
        column; each row corresponds to an item of text; if a word appeared in a
        text item, the corresponding cell in the 'count_table' is '1', otherwise
        it's '0'
    'misidentifications' - list of lists, each of which contains 3 items:
        the date/filename of the comic in which a character was misidentified,
        the incorrect character that appears in the text description of the
        comic, and the correct character who actually appears in the comic
    Returns corrected 'count_table'
    '''
    for i in range(len(misidentifications)):

        idx = dates_column[dates_column == misidentifications[i][0]].index.values
        temp = count_table.ix[idx, misidentifications[i][2]]
        count_table.ix[idx, misidentifications[i][2]] = count_table.ix[idx,
                                                        misidentifications[i][1]]
        count_table.ix[idx, misidentifications[i][1]] = temp

    return(count_table)


def correct_misidentified_characters_multiple_tables(dates_column, counts,
                                                    misidentifications):
    '''
    Loops function 'correct_misidentified_characters_in_table' for each
        count table in 'counts'
    Description of 'correct_misidentified_characters_in_table':
        Switches characters' counts on dates when a character was misidentified
        'dates_column' - Pandas Data Series with dates/filenames
        'count_table' - Pandas DataFrame/table that tallies whether a word or phrase
            (naming characters) appeared in items of text; each word has its own
            column; each row corresponds to an item of text; if a word appeared in a
            text item, the corresponding cell in the 'count_table' is '1', otherwise
            it's '0'
        'misidentifications' - list of lists, each of which contains 3 items:
            the date/filename of the comic in which a character was misidentified,
            the incorrect character that appears in the text description of the
            comic, and the correct character who actually appears in the comic
        Returns corrected 'count_table'
    '''

    for i in range(len(counts)):
        counts[i] = correct_misidentified_characters_in_table(dates_column,
                                                counts[i], misidentifications)

    return(counts)


def snoopy_and_personas_in_table(count_table):
    '''
    Adds column to 'count_table' that counts Snoopy and his personas, in case
        some text descriptions refer to him by the persona and not by 'Snoopy',
        which would undercount Snoopy's appearances
    '''

    count_table['snoopy and personas'] = (count_table['snoopy'] |
                                        count_table['world famous'] |
                                        count_table['joe cool'] |
                                        count_table['joe motocross'] |
                                        count_table['joe blackjack'] |
                                        count_table['joe sandbagger'] |
                                        count_table['joe grunge'] |
                                        count_table['flying ace'] |
                                        count_table['literary ace'] |
                                        count_table['masked marvel'] |
                                        count_table['easter beagle'] |
                                        count_table['lone beagle'] |
                                        count_table['revolutionary war patriot'] |
                                        count_table['legionnaire'])

    return(count_table)


def snoopy_and_personas_multiple_tables(counts):
    '''
    Loops function 'snoopy_and_personas_in_table' for each
        count table in 'counts'
    Description of 'snoopy_and_personas_in_table':
        Adds column to 'count_table' that counts Snoopy and his personas, in case
            some text descriptions refer to him by the persona and not by 'Snoopy',
            which would undercount Snoopy's appearances
    '''

    for i in range(len(counts)):
        counts[i] = snoopy_and_personas_in_table(counts[i])

    return(counts)


def save_tables_to_csv(list_of_tables, list_of_filenames):
    '''
    Saves Pandas DataFrames/tables in a list to 'csv' files
    '''

    for i in range(len(list_of_tables)):
        list_of_tables[i].to_csv(list_of_filenames[i] + '.csv',
                                sep=',', index=True)


def counts_by_comic_multiple_tables(dates_column, num_panels_column,
                                    panels_with_characters, counts):
    '''
    Groups panels by comic strip date, so that each row represents a comic (with
        1 or more panels) instead of a panel
    'dates_column' - Pandas Data Series with dates/filenames
    'num_panels_column' - Pandas Data Series with number of panels in that comic
    'count_table' - Pandas DataFrame/table that tallies whether a word or phrase
        (naming characters) appeared in items of text (each item corresponds to
        a panel); each word has its own column; each row corresponds to an item
        of text; if a word appeared in a text item, the corresponding cell in
        the 'count_table' is '1', otherwise it's '0'
    'panel_counts_by_comic' - counts are summed over panels per comic, i.e., in
        how many panels did this character appear in the comic?
    'counts_by_comic' - panel counts per comic are transformed into Boolean; did
        the character appear in this comic, yes or no?
    'proportions_by_comic' - summed counts per comic are divided by the number
        of panels in that comic
    'props_w_chars_by_comic' - summed counts per comic are divided by the number
        of panels that include a mention of a character in that comic; e.g., if
        the text description of a 4-panel comic mentions 'Snoopy' only once in
        the 1st panel and thereafter refers to him as 'he', then
        'proportions_by_comic' counts Snoopy's proportion as 1 / 4 = 0.25,
        whereas 'props_w_chars_by_comic' counts Snoopy at 1 / 1 = 1
    '''

    num_panels_by_comic = num_panels_column.groupby(dates_column).median()
    num_panels_w_chars_by_comic = panels_with_characters.groupby(dates_column).sum()
    panel_counts_by_comic = []
    counts_by_comic = []
    proportions_by_comic = []
    props_w_chars_by_comic = []

    for i in range(len(counts)):
        panel_counts_by_comic.append(counts[i].groupby(dates_column).sum())
        counts_by_comic.append(panel_counts_by_comic[i] > 0)
        proportions_by_comic.append(panel_counts_by_comic[i].divide(
            num_panels_by_comic, axis='index'))
        props_w_chars_by_comic.append(panel_counts_by_comic[i].divide(
            num_panels_w_chars_by_comic, axis='index'))

    return(panel_counts_by_comic, counts_by_comic, proportions_by_comic,
        props_w_chars_by_comic)


def main():
    '''
    Searches for words or phrases (mostly character names) in text descriptions
        of Peanuts and marks whether each word or phrase appears in each
        text description
    Adjusts counts in several ways:
        1) combines word/phrases that refer to the same character (e.g.,
            'Pig-Pen' and 'Pig Pen')
        2) corrects counts to distinguish between Patty and Peppermint Patty
        3) removes any appearances that occur before a character's first
            appearance in the comic strip or after the character's last
            appearance
        4) corrects for some cases where characters are misidentified by the
            text description of the comic (based only on happenstance
            observations, not on a systematic or comprehensive search for such
            misidentifications)
        5) provides combined count data for Snoopy and many of his personas, in
            case Snoopy is sometimes referred to by only a persona in a text
            description of a comic
    Provides count data for each panel (Boolean:  whether the word/phrase
        appears or not) in a comic and for each comic (sum of appearances for
        panels of that comic); also provides per-comic data as proportions
        (i.e., in what proportion of panels did the word/phrase appear for that
        comic?)
    Also provides summary tables of per-panel and per-comic count data
    '''

    import os

    table_folder = '07_separate_talk'
    table_file = 'expanded_table.csv'
    source_path = get_sibling_directory_path(table_folder)
    table_filepath = os.path.join(source_path, table_file)

    text_col_names = ['comics_speakers', 'text_nontalk', 'text_talk']
    expanded_table = read_table(table_filepath, text_col_names)
    expanded_table.to_csv('expanded_table.csv', sep='^', index=False)
    dates_column = expanded_table.ix[:, 'filename']         # convenient for later use

    # reading 'expanded_table' from the 'csv' file produces 11 NaN values in
    # each of the columns 'text_by_panels' and 'text_spell_corrected'; manual
    # inspection of each case showed that no text was being lost; deleting the
    # rows may disrupt some subsequent loops, and the NaNs won't affect the
    # character counts
    nan_n = expanded_table.ix[:, 'text_by_panels'].isnull().sum()
    panels_n = len(expanded_table) - nan_n
    #expanded_table.ix[:, 'text_by_panels'].fillna('', inplace=True)
    expanded_table.ix[:, 'text_spell_corrected'].fillna('', inplace=True)

    # names of characters to be counted
    # NOTE:  any non_character words included in the 'character_file' could
    # adversely affect the proportions in 'props_w_chars_by_comic'
    # 'characters_only.txt' includes characters only and is used for calculating
    # proportions in 'props_w_chars_by_comic'
    character_file = 'characters_only.txt'
    characters_only = read_text_file(character_file)
    character_file = 'characters_and_more.txt'
    characters = read_text_file(character_file)
    characters = [s.lower() for s in characters]
    characters = add_possessives_to_word_list(characters)   # tokenizer includes possessives

    # count the characters' appearances/mentions
    counts = count_characters(expanded_table, characters)
    counts_summary_1 = counts_summary_table(counts)
    counts_summary_1.to_csv('counts_summary_01.csv', sep=',', index=True)

    # merge counts for multiple search terms into single column with single name
    # e.g., counts for 'Pig-Pen' and 'Pig Pen' are merged together under 'Pig-Pen'
    counts = merge_characters_multiple_tables(counts)
    counts_summary_2 = counts_summary_table(counts)
    counts_summary_2.to_csv('counts_summary_02.csv', sep=',', index=True)

    # correct counts to distinguish between Patty and Peppermint Patty
    patty_folder = '06_character_talk'
    patty_file = 'peppermint_patty_dates.txt'
    patty_path = get_sibling_directory_path(patty_folder)
    patty_filepath = os.path.join(patty_path, patty_file)
    pep_patty_dates = read_text_file(patty_filepath)
    write_list_to_text_file(pep_patty_dates, patty_file, 'w')

    counts = adjust_patty_counts_multiple_tables(dates_column, counts,
                                                pep_patty_dates)
    counts_summary_3 = counts_summary_table(counts)
    counts_summary_3.to_csv('counts_summary_03.csv', sep=',', index=True)

    # remove appearances/mentions that occur outside a character's appearance dates
    counts = adjust_counts_by_appearance_dates_multiple_tables(dates_column,
                                                            counts)
    counts_summary_4 = counts_summary_table(counts)
    counts_summary_4.to_csv('counts_summary_04.csv', sep=',', index=True)
    counts_summary_5 = counts_summary_3 - counts_summary_4
    counts_summary_5.to_csv('counts_summary_05.csv', sep=',', index=True)

    # characters misidentified in text descriptions:  date, incorrect character,
    # correct character
    misidentifications = [['1999-09-19', 'linus', 'rerun'],
                        ['1999-11-23', 'linus', 'rerun'],
                        ['1999-12-05', 'rerun', 'linus']]
    # correct counts for misidentified characters
    counts = correct_misidentified_characters_multiple_tables(dates_column,
                                                    counts, misidentifications)
    counts_summary_6 = counts_summary_table(counts)
    counts_summary_6.to_csv('counts_summary_06.csv', sep=',', index=True)

    # some text descriptions might refer to Snoopy only by one of his personas,
    # which would produce and undercount of Snoopy's appearances under 'snoopy';
    # so, add a new column that includes appearances/mentions of Snoopy and
    # his major personas combined
    counts = snoopy_and_personas_multiple_tables(counts)
    counts_summary_7 = counts_summary_table(counts)
    counts_summary_7.to_csv('counts_summary_07.csv', sep=',', index=True)

    # calculate counts per comic, instead of per panel
    num_panels_column = expanded_table.ix[:, 'num_panels']
    panels_with_characters = counts[0][characters_only].any(axis=1)
    (panel_counts_by_comic, counts_by_comic,
    proportions_by_comic, props_w_chars_by_comic) = (
        counts_by_comic_multiple_tables(dates_column, num_panels_column,
                                        panels_with_characters, counts))
    panel_counts_by_comic_summary = counts_summary_table(panel_counts_by_comic)
    #panel_counts_by_comic_summary.to_csv('panel_counts_by_comic_summary.csv',
    #                                     sep=',', index=True)
    counts_by_comic_summary = counts_summary_table(counts_by_comic)
    counts_by_comic_summary.to_csv('counts_by_comic_summary.csv',
                                sep=',', index=True)

    # save counts tables
    count_types = ['1_overall', '2_nontalk', '3_talk',
                '4_odd_quotes', '5_no_quotes']
    counts_filenames = ['counts_by_panel_' + e for e in count_types]
    counts_by_comic_filenames = ['counts_by_comic_' + e for e in count_types]
    proportions_by_comic_filenames = ['proportions_by_comic_' + e
                                    for e in count_types]
    props_w_chars_by_comic_filenames = ['proportions_with_chars_by_comic_' + e
                                        for e in count_types]

    save_tables_to_csv(counts, counts_filenames)
    save_tables_to_csv(counts_by_comic, counts_by_comic_filenames)
    #save_tables_to_csv(proportions_by_comic, proportions_by_comic_filenames)
    save_tables_to_csv(props_w_chars_by_comic, props_w_chars_by_comic_filenames)


if __name__ == '__main__':
    main()