Here are the Anaconda ‘environment.yml’ specifications:
name: pandas
dependencies:
  - mkl=2017.0.1=0
  - numpy=1.13.0=py36_0
  - openssl=1.0.2l=0
  - pandas=0.20.2=np113py36_0
  - pip=9.0.1=py36_1
  - python=3.6.1=2
  - python-dateutil=2.6.0=py36_0
  - pytz=2017.2=py36_0
  - readline=6.2=2
  - setuptools=27.2.0=py36_0
  - six=1.10.0=py36_0
  - sqlite=3.13.0=0
  - tk=8.5.18=0
  - wheel=0.29.0=py36_0
  - xz=5.2.2=1
  - zlib=1.2.8=3
Here is the code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
#! /usr/bin/env python3
"""Clean a table of text descriptions and split each one into panels.

Reads a '^'-separated 'csv' table of names (1st column) and text
descriptions (2nd column), removes repeated text, splits every description
into per-panel strings, and saves the table with two extra columns: the
number of panels and the list of cleaned panel texts.
"""

import os


def get_sibling_directory_path(sibling_directory_name: str) -> str:
    """Return the path of a folder that shares a parent with the cwd.

    The result is the parent of the current working directory joined with
    'sibling_directory_name'. The folder is not required to exist.

    Args:
        sibling_directory_name: name of the folder located next to the
            current working directory.

    Returns:
        The path to the sibling folder as a string.
    """
    # 'os.path.dirname' keeps the root component intact, so the result stays
    # absolute even when the working directory is the filesystem root or a
    # top-level folder; slicing at the last separator would yield '' there
    # and silently turn the result into a relative path.
    parent_directory_path = os.path.dirname(os.getcwd())
    return os.path.join(parent_directory_path, sibling_directory_name)


def split_text_by_substring(text: str, splitter_text: str) -> list:
    """Split 'text' into cleaned substrings demarcated by 'splitter_text'.

    Some text descriptions are repeated; in every such case the marker
    'dialogue-text' demarcates the start of the repetition, so everything
    from the first occurrence of that marker onwards is discarded before
    splitting.

    Each substring then:
      * is dropped if it is shorter than 4 characters (measured before any
        trimming);
      * loses one leading character if that character is '>' or ')';
      * loses one trailing character if that character is '<' or '(';
      * is stripped of leading and trailing whitespace.

    Args:
        text: the raw description.
        splitter_text: the substring that separates panels; it is excluded
            from the results.

    Returns:
        A list of cleaned substrings, in their original order.

    Raises:
        ValueError: if 'splitter_text' is empty (raised by 'str.split').
    """
    double_marker = 'dialogue-text'
    min_len = 4           # minimum length of a substring for inclusion
    start_exclude = '>)'  # characters to exclude from start of substrings
    end_exclude = '<('    # characters to exclude from end of substrings

    # Keep only the text before the repetition marker.
    double_removed = text.split(double_marker)[0]

    cleaned_text = []
    for segment in double_removed.split(splitter_text):
        if len(segment) < min_len:
            continue
        # The length check above guarantees 'segment' is non-empty, so
        # indexing its first and last characters is safe.
        start = 1 if segment[0] in start_exclude else 0
        end = -1 if segment[-1] in end_exclude else None
        cleaned_text.append(segment[start:end].strip())
    return cleaned_text


def prepare_text(table_filepath, output_filepath='table.csv'):
    """Clean the text descriptions of a table and save the result.

    The input 'csv' file holds names (1st column) and text descriptions
    (2nd column). Each description is cleaned by removing repetitions,
    splitting it by the HTML tag that demarcates panels, and removing extra
    leading and trailing punctuation and whitespace. The original table is
    saved together with the number of panels ('num_panels' column) and the
    list of cleaned panel texts ('text_by_panels' column).

    Args:
        table_filepath: path to the input 'csv' file.
        output_filepath: where the modified table is written; defaults to
            'table.csv' in the current working directory.
    """
    # Imported here so the text helpers above stay usable without pandas.
    import pandas as pd

    # '^' used as separator because it does not appear in any text
    # descriptions
    table = pd.read_csv(table_filepath, sep='^')
    text_col = 1
    splitter = 'BR'  # HTML tag that demarcates descriptions of each panel

    text_by_row = []
    for text in table.iloc[:, text_col]:
        if not isinstance(text, str):
            # A missing description is not read as a string; treat it as
            # empty text so the row gets zero panels.
            text = ''
        text_by_row.append(split_text_by_substring(text, splitter))

    table['num_panels'] = [len(panels) for panels in text_by_row]
    table['text_by_panels'] = text_by_row
    table.to_csv(output_filepath, sep='^', index=False)


def main():
    """Modify the table of text descriptions and save it in the cwd.

    Reads 'table.csv' from the sibling folder '03_extract_text' and passes
    it to 'prepare_text', which writes the modified table to 'table.csv' in
    the current working directory.

    Table modifications (from 'prepare_text'): cleans the text descriptions
    by removing repetitions, splitting the descriptions by HTML tags that
    demarcate panels, removing extra leading and trailing punctuation and
    whitespace, and saving the original table along with the number of
    panels (3rd column) and cleaned text in a list (4th column).

    Some text descriptions are repeated; in every case, the text
    'dialogue-text' is included in the description and demarcates the start
    of the repetition.
    """
    table_folder = '03_extract_text'
    table_file = 'table.csv'
    table_filepath = os.path.join(
        get_sibling_directory_path(table_folder), table_file)
    prepare_text(table_filepath)


if __name__ == '__main__':
    main()