Extract Text

Here are the Anaconda ‘environment.yml’ specifications:

name: pandas_bs4
dependencies:
- beautifulsoup4=4.6.0=py36_0
- mkl=2017.0.1=0
- numpy=1.13.0=py36_0
- openssl=1.0.2l=0
- pandas=0.20.2=np113py36_0
- pip=9.0.1=py36_1
- python=3.6.1=2
- python-dateutil=2.6.0=py36_0
- pytz=2017.2=py36_0
- readline=6.2=2
- setuptools=27.2.0=py36_0
- six=1.10.0=py36_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py36_0
- xz=5.2.2=1
- zlib=1.2.8=3

Here is the code:

#! /usr/bin/env python3


def get_soup(webpage_location):
    '''
    Fetches a webpage and hands its contents to Beautiful Soup

    input:  location of a HTML webpage
    output:  Beautiful Soup object built from the webpage's HTML
    '''

    from bs4 import BeautifulSoup

    # 'get_webpage' is defined elsewhere (function not shown); whatever it
    #       returns is parsed with Python's built-in 'html.parser'
    return BeautifulSoup(get_webpage(webpage_location), 'html.parser')


def extract_text(soup):
    '''
    Extracts text description of image from HTML

    The description is read from the 'alt' attribute of the first element
        matched by a fixed, deeply nested CSS selector ending in 'a img'

    input:  HTML parsed by package Beautiful Soup
    output:  the text description, or a string starting with 'ERROR:' if the
        description could not be read (e.g., the selector matched nothing)
    '''

    image_html = soup.select('div div div div div div div a img')

    try:
        # first match only; raises IndexError when nothing matched
        image_text = image_html[0].get('alt')
    except Exception:
        # best effort:  one unreadable page must not stop a long batch run;
        #       unlike a bare 'except', this still lets KeyboardInterrupt and
        #       SystemExit propagate so the run can be aborted
        image_text = 'ERROR:  failed to extract text from html'

    return image_text


def get_text_from_webpage(webpage_location):
    '''
    Parses a webpage and pulls the image description out of it

    input:  location for a HTML webpage
    output:  text description of image from HTML
    '''

    # parse first, then extract; both steps are delegated to helper functions
    return extract_text(get_soup(webpage_location))


def save_webpage_text_to_table(date_list, webpage_list):
    '''
    Extracts the image description from every webpage and saves the results

    input:  list of dates and list of webpage locations
    output:  saves 'csv' file with table of extracted descriptions of images
        (2nd column) and corresponding dates (1st column); the file is
        written as 'table.csv' with '^' as the field separator
    '''

    import pandas as pd

    message_interval = 100
    page_count = len(webpage_list)

    # column order is stated explicitly so that the file layout and the
    #       positional write below do not depend on how pandas happens to
    #       order the keys of the 'data' dictionary
    text_table = pd.DataFrame(data={'pagename': date_list, 'text': ''},
                              columns=['pagename', 'text'])
    text_column = text_table.columns.get_loc('text')

    for i, webpage in enumerate(webpage_list):

        # loop status message
        if (i % message_interval) == 0:
            print('Processing page {0} of {1}, which is {2:.0f}%'
                .format(i + 1, page_count, 100 * (i + 1) / page_count))

        text_table.iloc[i, text_column] = get_text_from_webpage(webpage)

    # '^' used as separator because it does not appear in any text descriptions
    text_table.to_csv('table.csv', sep='^', index=False)


def main():
    '''
    Builds a table of image descriptions taken from HTML webpages and saves
        that table in a 'csv' file in the current working directory
    '''
    # webpage for comic dated June 3, 1970 did not have a text description

    # 'get_webpage_list' is defined elsewhere (function not shown); it
    #       supplies the dates and the matching webpage locations
    dates, webpages = get_webpage_list()

    # pull the text description of the image out of each webpage and write
    #       all of them to a table in a 'csv' file
    save_webpage_text_to_table(dates, webpages)


# run the extraction only when this file is executed as a script, not when
#       it is imported as a module
if __name__ == '__main__':
    main()