Here are the Anaconda ‘environment.yml’ specifications:
name: pandas_bs4
dependencies:
  - beautifulsoup4=4.6.0=py36_0
  - mkl=2017.0.1=0
  - numpy=1.13.0=py36_0
  - openssl=1.0.2l=0
  - pandas=0.20.2=np113py36_0
  - pip=9.0.1=py36_1
  - python=3.6.1=2
  - python-dateutil=2.6.0=py36_0
  - pytz=2017.2=py36_0
  - readline=6.2=2
  - setuptools=27.2.0=py36_0
  - six=1.10.0=py36_0
  - sqlite=3.13.0=0
  - tk=8.5.18=0
  - wheel=0.29.0=py36_0
  - xz=5.2.2=1
  - zlib=1.2.8=3
Here is the code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
#!/usr/bin/env python3
'''
Extracts the text descriptions ('alt' attributes) of images from a list of
HTML webpages and saves them, with their dates, to a 'csv' file.
'''

# CSS selector for the image whose 'alt' attribute holds the description
_IMAGE_SELECTOR = 'div div div div div div div a img'

# stored in place of a description whenever extraction fails
_EXTRACTION_ERROR = 'ERROR: failed to extract text from html'


def get_soup(webpage_location):
    '''
    Retrieves a webpage and parses it with Beautiful Soup

    input: location of a HTML webpage
    output: html parsed by Beautiful Soup module
    '''
    # third-party imports are kept function-local throughout this script
    import bs4

    text = get_webpage(webpage_location)    # function not shown
    return bs4.BeautifulSoup(text, 'html.parser')   # parses html


def extract_text(soup):
    '''
    Extracts text description of image from HTML

    input: HTML parsed by package Beautiful Soup
    output: the 'alt' text of the first image matched by the selector, or
        the message 'ERROR: failed to extract text from html' when no image
        is matched or the matched image has no 'alt' attribute
    '''
    image_html = soup.select(_IMAGE_SELECTOR)

    # Extraction stays best-effort, but only the failures that indexing and
    # attribute lookup can produce are caught, so that KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    try:
        image_text = image_html[0].get('alt')
    except (IndexError, AttributeError, TypeError):
        return _EXTRACTION_ERROR

    # an image without an 'alt' attribute has no description to report
    if image_text is None:
        return _EXTRACTION_ERROR
    return image_text


def get_text_from_webpage(webpage_location):
    '''
    input: location for a HTML webpage
    output: text description of image from HTML
    '''
    return extract_text(get_soup(webpage_location))


def save_webpage_text_to_table(date_list, webpage_list, *,
                               output_path='table.csv', separator='^',
                               message_interval=100):
    '''
    input: list of dates and list of webpage locations, of equal length;
        optionally (keyword only) the output file path, the column
        separator, and the number of pages between status messages
        (0 or None turns the messages off)
    output: saves 'csv' file with table of extracted descriptions of images
        (2nd column, 'text') and corresponding dates (1st column,
        'pagename')
    raises: ValueError if the two lists differ in length
    '''
    date_list = list(date_list)
    webpage_list = list(webpage_list)

    # fail before any page is fetched rather than part-way through the run
    if len(date_list) != len(webpage_list):
        raise ValueError(
            'date_list and webpage_list must have the same length '
            '({0} != {1})'.format(len(date_list), len(webpage_list)))

    import pandas as pd

    page_count = len(webpage_list)
    text_list = []
    for i, webpage_location in enumerate(webpage_list):

        # loop status message
        if message_interval and (i % message_interval) == 0:
            print('Processing page {0} of {1}, which is {2:.0f}%'
                  .format(i + 1, page_count, 100 * (i + 1) / page_count))

        text_list.append(get_text_from_webpage(webpage_location))

    # column order is stated explicitly instead of relying on how pandas
    # orders the keys of a dict
    text_table = pd.DataFrame(
        data={'pagename': date_list, 'text': text_list},
        columns=['pagename', 'text'])

    # '^' used as separator because it does not appear in any text descriptions
    text_table.to_csv(output_path, sep=separator, index=False)


def main():
    '''
    Extracts descriptions of images from HTML webpages and saves those
    descriptions in a table in a 'csv' file in the current working
    directory
    '''

    # webpage for comic dated June 3, 1970 did not have a text description
    date_list, webpage_list = get_webpage_list()    # function not shown

    # extract text descriptions of images from webpages and save them to a
    # table in a 'csv' file
    save_webpage_text_to_table(date_list, webpage_list)


if __name__ == '__main__':
    main()