Here are the Anaconda ‘environment.yml’ specifications:
name: pandas
dependencies:
  - mkl=2017.0.1=0
  - numpy=1.13.0=py36_0
  - openssl=1.0.2l=0
  - pandas=0.20.2=np113py36_0
  - pip=9.0.1=py36_1
  - python=3.6.1=2
  - python-dateutil=2.6.0=py36_0
  - pytz=2017.2=py36_0
  - readline=6.2=2
  - setuptools=27.2.0=py36_0
  - six=1.10.0=py36_0
  - sqlite=3.13.0=0
  - tk=8.5.18=0
  - wheel=0.29.0=py36_0
  - xz=5.2.2=1
  - zlib=1.2.8=3
Here is the code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 |
#! /usr/bin/env python3
"""Expand a table of comic descriptions to one row per panel.

Each panel description is also split into quoted text ('talk') and
unquoted text ('nontalk'), where quoted text is text enclosed in double
quotes.
"""

import os
import re
from ast import literal_eval

import pandas as pd


def get_sibling_directory_path(sibling_directory_name):
    """Return the path of a folder that shares a parent with the cwd.

    The path is built from the parent of the current working directory
    and 'sibling_directory_name'; the folder is not checked for existence.
    """
    # 'os.path.dirname' keeps the root separator when the working
    # directory sits directly under the filesystem root, which slicing at
    # the last separator by hand does not.
    parent_directory_path = os.path.dirname(os.getcwd())
    return os.path.join(parent_directory_path, sibling_directory_name)


def print_full(x):
    """Print 'x' with pandas' row limit raised to 'len(x)'.

    Taken from:
    https://stackoverflow.com/questions/19124601/is-there-a-way-to-pretty-print-the-entire-pandas-series-dataframe/19126566
    """
    # The context manager restores the previous 'display.max_rows' value
    # even if printing raises.
    with pd.option_context('display.max_rows', len(x)):
        print(x)


def print_intermittent_status_message_in_loop(iteration, every_xth_iteration,
                                              total_iterations):
    """Print a progress message on every 'every_xth_iteration'-th iteration.

    'iteration' is the zero-based loop counter; the message reports it
    one-based, together with the percentage of 'total_iterations' done.
    """
    if iteration % every_xth_iteration == 0:
        print('Processing file {0} of {1}, which is {2:.0f}%'
              .format(iteration + 1, total_iterations,
                      100 * (iteration + 1) / total_iterations))


def read_table(table_filepath, column_of_lists):
    """Read a '^'-separated 'csv' file into a DataFrame.

    'column_of_lists' names the column(s) whose cells are parsed with
    'ast.literal_eval' (e.g., into lists). It may be a single column name
    or any iterable of column names.
    """
    if isinstance(column_of_lists, str):
        column_of_lists = [column_of_lists]
    converters = {column: literal_eval for column in column_of_lists}
    # '^' used as separator because it does not appear in any text
    # descriptions
    return pd.read_csv(table_filepath, sep='^', converters=converters)


def read_text_file(text_filename, as_string=False, encoding=None):
    """Read a text file as a list of lines or as one string.

    By default each line becomes one list item, without its trailing
    newline. If 'as_string' is True, the whole file is returned as a
    single string with all newlines removed. 'encoding' is passed to
    'open'; 'None' keeps the platform default.

    If the file cannot be opened or decoded, an error-message string is
    returned instead of raising.
    """
    try:
        with open(text_filename, encoding=encoding) as text:
            if as_string:
                return text.read().replace('\n', '')
            return [line.rstrip('\n') for line in text]
    except (OSError, ValueError):
        # OSError covers missing/unreadable files; ValueError covers
        # decoding failures (UnicodeDecodeError is a subclass).
        return 'There was an error while trying to read the file'


def write_list_to_text_file(a_list, text_file_name, overwrite_or_append='a'):
    """Write a list of strings to a UTF-8 text file, one per line.

    Appends by default; change to overwriting by passing 'w' instead of
    'a' as 'overwrite_or_append'.
    """
    # 'with' closes the file on every path; if 'open' itself fails there
    # is no half-initialized handle to clean up.
    with open(text_file_name, overwrite_or_append,
              encoding='utf-8') as textfile:
        textfile.writelines(element + '\n' for element in a_list)


def expand_table(table, list_column_positions=(3, 4, 7)):
    """Expand a table so that each list element gets its own row.

    In the original table/dataframe, several columns contain lists, all
    of which are expected to have an equal number of elements per row.
    Each row is repeated once per list element: list columns contribute
    their j-th element, all other columns repeat their value.

    'list_column_positions' gives the positional indices of the list
    columns; the first one determines how many rows each original row
    expands into. The default (3, 4, 7) matches the 8-column table used
    in this project.
    """
    if not list_column_positions:
        raise ValueError('list_column_positions must not be empty')
    list_positions = set(list_column_positions)
    length_position = list_column_positions[0]

    rows = []
    for record in table.itertuples(index=False, name=None):
        for j in range(len(record[length_position])):
            rows.append([cell[j] if position in list_positions else cell
                         for position, cell in enumerate(record)])
    return pd.DataFrame(rows, columns=table.columns)


def find_substring_idx(a_string, substring):
    """Return the starting indices of 'substring' within 'a_string'.

    Matches are literal (regex metacharacters in 'substring' have no
    special meaning) and non-overlapping. If 'substring' is empty (i.e.,
    ''), returns every index from zero to the length of 'a_string'.
    """
    pattern = re.escape(substring)
    return [match.start() for match in re.finditer(pattern, a_string)]


def quotes_n_by_panel(expanded_table):
    """Recompute the quote-count columns for each panel.

    The 'no_quotes_n' and 'odd_quotes_n' variables in the original table
    gave an overall sum of the number of panels with no double-quotes or
    an odd number of quotes, respectively, since each row in the original
    table represented a single comic. In the expanded table, each row
    represents a single panel within a comic. This function adjusts these
    two variables so that each row's value (0 or 1) describes only that
    row/panel, adds a 'quotes_n' column with the number of double quotes
    in 'text_spell_corrected', and renames the two flag columns to
    'no_quotes' and 'odd_quotes'.

    The input table is modified in place (columns set and renamed); the
    returned table is a reordered selection that places 'quotes_n'
    (position 8) right after the first five columns, so it expects the
    8-column layout used in this project.
    """
    message_interval = 1000
    loop_len = len(expanded_table)

    quotes_ns = []
    for i, a_string in enumerate(expanded_table['text_spell_corrected']):
        print_intermittent_status_message_in_loop(i, message_interval,
                                                  loop_len)
        quotes_ns.append(len(find_substring_idx(a_string, '"')))

    # Whole-column assignment replaces the per-cell '.ix' writes; '.ix'
    # is no longer available in current pandas releases.
    expanded_table['no_quotes_n'] = [int(n == 0) for n in quotes_ns]
    expanded_table['odd_quotes_n'] = [int(n % 2 != 0) for n in quotes_ns]
    expanded_table['quotes_n'] = quotes_ns

    expanded_table.rename(columns={'no_quotes_n': 'no_quotes',
                                   'odd_quotes_n': 'odd_quotes'},
                          inplace=True)
    expanded_table = expanded_table.iloc[:, [0, 1, 2, 3, 4, 8, 5, 6, 7]]
    return expanded_table


def separate_talk(strings, quotes_idx):
    """Divide each string in 'strings' into lists of 'talk' and 'nontalk'.

    'Talk' substrings are quotes, that is, substrings that are enclosed
    in double-quotes; 'non-talk' is anything else in the string.

    Inputs:
        'strings' is a Pandas Series (or other sequence) of strings.
        'quotes_idx' has the same length as 'strings'; each element holds
            one list per 'talk' substring in the corresponding string.
            Items 1 and 2 of each such list are read as the positions of
            the opening and closing double quotes (item 0 is not used
            here). An element that is empty, or that holds only empty
            lists, means there is no 'talk' information for that string.

    Outputs:
        Lists 'nontalk' and 'talk', each the same length as 'strings'.
        Each element is a list of the stripped non-quoted (respectively
        quoted) substrings of the corresponding string, in order of
        appearance and without the quote characters themselves. For a
        string without 'talk' information both elements are '[[]]'.
    """
    nontalk = []
    talk = []
    message_interval = 1000
    loop_len = len(strings)

    # for each panel
    for i, (string, panel_quotes) in enumerate(zip(strings, quotes_idx)):
        print_intermittent_status_message_in_loop(i, message_interval,
                                                  loop_len)

        # Drop empty placeholder lists so that both '[]' and '[[]]' are
        # treated as "no information on the positions of double quotes".
        spans = [span for span in panel_quotes if span] if panel_quotes else []
        if not spans:
            nontalk.append([[]])
            talk.append([[]])
            continue

        panel_nontalk = []
        panel_talk = []
        nontalk_start = 0
        # for each occurrence of a 'talk' substring within a panel
        for span in spans:
            start_quote_pos, end_quote_pos = span[1], span[2]
            panel_nontalk.append(string[nontalk_start:start_quote_pos].strip())
            panel_talk.append(string[start_quote_pos + 1:end_quote_pos].strip())
            # The next 'nontalk' piece starts just past the closing
            # quote, so the quote character is not included in it.
            nontalk_start = end_quote_pos + 1

        # after all 'talk' substrings have been added, if there's still a
        # substring remaining at the end, append it as 'nontalk'
        if end_quote_pos < len(string) - 1:
            panel_nontalk.append(string[end_quote_pos + 1:].strip())

        nontalk.append(panel_nontalk)
        talk.append(panel_talk)

    return nontalk, talk


def main():
    """Expand the comics table to one row per panel and split the text.

    Expands the table of comic descriptions so that each row represents a
    panel in a comic instead of the entire comic (i.e., a comic with 4
    panels is represented by 4 rows instead of 1 row).

    Also adds columns to the table that divide the descriptions into
    speech (or thought) and non-speech (columns labeled 'text_talk' and
    'text_nontalk'); speech was identified by enclosure by double quotes.
    """
    table_folder = '06_character_talk'
    table_file = 'table.csv'
    source_path = get_sibling_directory_path(table_folder)
    table_filepath = os.path.join(source_path, table_file)

    text_col_names = ['text_by_panels', 'text_spell_corrected',
                      'comics_speakers']
    table = read_table(table_filepath, text_col_names)
    table.to_csv('table.csv', sep='^', index=False)

    expanded_table = expand_table(table)
    expanded_table = quotes_n_by_panel(expanded_table)
    # Written once here and again below after the talk columns are added.
    expanded_table.to_csv('expanded_table.csv', sep='^', index=False)

    nontalk, talk = separate_talk(expanded_table['text_spell_corrected'],
                                  expanded_table['comics_speakers'])
    expanded_table['text_nontalk'] = nontalk
    expanded_table['text_talk'] = talk
    expanded_table.to_csv('expanded_table.csv', sep='^', index=False)


if __name__ == '__main__':
    main()