Divide Text

Here are the Anaconda ‘environment.yml’ specifications:

name: pandas
dependencies:
- mkl=2017.0.1=0
- numpy=1.13.0=py36_0
- openssl=1.0.2l=0
- pandas=0.20.2=np113py36_0
- pip=9.0.1=py36_1
- python=3.6.1=2
- python-dateutil=2.6.0=py36_0
- pytz=2017.2=py36_0
- readline=6.2=2
- setuptools=27.2.0=py36_0
- six=1.10.0=py36_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py36_0
- xz=5.2.2=1
- zlib=1.2.8=3

Here is the code:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#! /usr/bin/env python3


def get_sibling_directory_path(sibling_directory_name):
    '''
    returns path for a specified folder that is in the same parent directory as
        the current working directory
    input:  'sibling_directory_name' is the name of the folder; it is joined
        onto the parent of the current working directory as-is
    output:  the joined path as a string; the folder is not checked for
        existence and is not created
    '''

    import os

    # 'os.path.dirname' is used instead of slicing the path at its last
    #   separator so that the filesystem root keeps its separator; slicing
    #   reduced a root working directory to an empty (or drive-only) string,
    #   which made the returned path relative instead of absolute
    parent_directory_path = os.path.dirname(os.getcwd())
    return os.path.join(parent_directory_path, sibling_directory_name)


def split_text_by_substring(text, splitter_text):
    '''
    cuts 'text' at every occurrence of 'splitter_text' and returns the pieces
        as a list of strings; 'splitter_text' itself is not part of any piece
    anything from the first occurrence of 'dialogue-text' onward is discarded
        before cutting, because that marker demarcates the start of a repeated
        copy of the description
    pieces shorter than 4 characters (measured before any trimming) are dropped
    for each piece that is kept, one leading '>' or ')' and one trailing '<' or
        '(' are removed, followed by leading and trailing whitespace
    '''

    repetition_marker = 'dialogue-text'
    shortest_kept = 4       # pieces below this length are discarded
    leading_junk = '>)'     # a single one of these is cut from the front
    trailing_junk = '<('    # a single one of these is cut from the back

    # keep only the text that precedes the repeated copy
    unique_part = text.partition(repetition_marker)[0]

    panels = []
    for piece in unique_part.split(splitter_text):
        if len(piece) < shortest_kept:
            continue
        first = 1 if piece[0] in leading_junk else 0
        last = -1 if piece[-1] in trailing_junk else None
        panels.append(piece[first:last].strip())

    return panels


def prepare_text(table_filepath):
    '''
    input:  'table_filepath' is the path to a '^'-separated 'csv' file whose
        2nd column holds the text descriptions (the 1st column is expected to
        hold their names)
    each description is passed to 'split_text_by_substring' with the HTML tag
        'BR' as the panel separator; a cell that is not a string (e.g., a
        missing description) is treated as an empty description
    two columns are appended to the table:  'num_panels' (number of cleaned
        panel texts in the row) and 'text_by_panels' (the list of cleaned panel
        texts)
    output:  nothing is returned; the extended table is written to 'table.csv'
        in the current working directory, '^'-separated and without the index
    '''

    import pandas as pd

    panel_tag = 'BR'            # HTML tag between the descriptions of panels
    description_column = 1      # position of the text descriptions

    # '^' is the separator because it never occurs in the text descriptions
    frame = pd.read_csv(table_filepath, sep='^')

    panels_per_row = []
    for row in range(len(frame)):
        description = frame.iloc[row, description_column]
        if not isinstance(description, str):
            description = ''    # missing description
        panels_per_row.append(
            split_text_by_substring(description, panel_tag))

    frame['num_panels'] = [len(panels) for panels in panels_per_row]
    frame['text_by_panels'] = panels_per_row
    frame.to_csv('table.csv', sep='^', index=False)


def main():
    '''
    Reads the table 'table.csv' from the folder '03_extract_text', which is
        looked up next to the current working directory (same parent folder),
        and hands it to 'prepare_text'
    'prepare_text' cleans the text descriptions (2nd column) by dropping the
        repeated part, splitting them at the HTML tags that demarcate panels,
        and trimming extra leading and trailing punctuation and whitespace; it
        then saves the table, extended by the number of panels and the list of
        cleaned panel texts, as a 'csv' file in the current working directory
    Repeated descriptions are recognized by the text 'dialogue-text', which
        demarcates the start of the repetition
    '''

    import os

    source_directory = get_sibling_directory_path('03_extract_text')
    prepare_text(os.path.join(source_directory, 'table.csv'))


# run the text preparation only when this file is executed as a script, not
#   when it is imported as a module
if __name__ == '__main__':
    main()