Here are the Anaconda ‘environment.yml’ specifications:
name: make_corpus_fast
dependencies:
  - boto=2.48.0=py36_0
  - bz2file=0.98=py36_0
  - cython=0.26=py36_0
  - decorator=4.1.2=py36_0
  - gensim=2.3.0=np113py36_0
  - ipython=6.1.0=py36_0
  - ipython_genutils=0.2.0=py36_0
  - jedi=0.10.2=py36_2
  - libgfortran=3.0.0=1
  - line_profiler=2.0=py36_0
  - mkl=2017.0.3=0
  - nltk=3.2.4=py36_0
  - numpy=1.13.1=py36_0
  - openssl=1.0.2l=0
  - pandas=0.20.3=py36_0
  - path.py=10.3.1=py36_0
  - pexpect=4.2.1=py36_0
  - pickleshare=0.7.4=py36_0
  - pip=9.0.1=py36_1
  - prompt_toolkit=1.0.15=py36_0
  - ptyprocess=0.5.2=py36_0
  - pygments=2.2.0=py36_0
  - python=3.6.2=0
  - python-dateutil=2.6.1=py36_0
  - pytz=2017.2=py36_0
  - readline=6.2=2
  - requests=2.14.2=py36_0
  - scipy=0.19.1=np113py36_0
  - setuptools=27.2.0=py36_0
  - simplegeneric=0.8.1=py36_1
  - six=1.10.0=py36_0
  - smart_open=1.5.3=py36_0
  - sqlite=3.13.0=0
  - tk=8.5.18=0
  - traitlets=4.3.2=py36_0
  - wcwidth=0.1.7=py36_0
  - wheel=0.29.0=py36_0
  - xz=5.2.3=0
  - zlib=1.2.11=0
  - pip:
    - html2text==2016.9.19
    - ipython-genutils==0.2.0
    - line-profiler==2.0
    - prompt-toolkit==1.0.15
    - smart-open==1.5.3
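Assuming the specification above is saved as 'environment.yml', the environment should be reproducible with 'conda env create -f environment.yml' and then activated with 'source activate make_corpus_fast' (the activation command used by conda releases of that era).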
Here is the code:
#! /usr/bin/env python3

# moved these from function 'modify_text' to global variables for speed
from html2text import html2text
from nltk.tokenize import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer


def decompress_bz2_file(filename, decompressed_filename):
    '''
    Decompresses 'bz2' file and saves it to a new file
    taken from:
    https://stackoverflow.com/questions/16963352/decompress-bz2-files
    '''
    from bz2 import BZ2File as bz2_file
    with open(decompressed_filename, 'wb') as new_file, \
            bz2_file(filename, 'rb') as file:
        for data in iter(lambda: file.read(100 * 1024), b''):
            new_file.write(data)


def read_text_file(text_filename, as_string=False):
    '''
    reads each line in a text file as a list item and returns list by default
    if 'as_string' is 'True', reads entire text file as a single string
    '''
    text_list = []
    try:
        with open(text_filename) as text:
            if as_string:
                # reads text file as single string
                text_list = text.read().replace('\n', '')
            else:
                # reads each line of text file as item in a list
                for line in text:
                    text_list.append(line.rstrip('\n'))
            text.close()
        return(text_list)
    except:
        return('There was an error while trying to read the file')


def write_list_to_text_file(a_list, text_file_name, overwrite_or_append='a'):
    '''
    writes a list of strings to a text file
    appends by default; change to overwriting by setting to 'w' instead of 'a'
    '''
    try:
        textfile = open(text_file_name, overwrite_or_append, encoding='utf-8')
        for element in a_list:
            textfile.write(element)
            textfile.write('\n')
    finally:
        textfile.close()


def print_intermittent_status_message_in_loop(iteration, every_xth_iteration,
                                              total_iterations):
    '''
    Prints a message updating the user on the progress of a loop
    '''
    if iteration % every_xth_iteration == 0:
        import time
        print('Processing file {0} of {1}, which is {2:.0f}% at {3}'
              .format(iteration + 1,
                      total_iterations,
                      100 * (iteration + 1) / total_iterations,
                      time.ctime(int(time.time()))))


def hms_string(sec_elapsed):
    '''
    # downloaded from:
    # http://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html
    # https://github.com/jeffheaton/article-code/blob/master/python/wikipedia/wiki-basic-stream.py
    # Simple example of streaming a Wikipedia
    # Copyright 2017 by Jeff Heaton, released under the The GNU Lesser General Public License (LGPL).
    # http://www.heatonresearch.com
    '''
    # Nicely formatted time string
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


def strip_tag_name(t):
    '''
    # downloaded from:
    # http://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html
    # https://github.com/jeffheaton/article-code/blob/master/python/wikipedia/wiki-basic-stream.py
    # Simple example of streaming a Wikipedia
    # Copyright 2017 by Jeff Heaton, released under the The GNU Lesser General Public License (LGPL).
    # http://www.heatonresearch.com
    '''
    t = t.tag
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t


def process_wiki_xml(event, elem, tname, title, wiki_id, redirect, inrevision,
                     ns, page_text, iter_count, save_pages=False):
    '''
    Processes element extracted from Wikipedia XML file to allow
        classification of page into template, redirect, or article page and
        returns important information from page (e.g., page title, page
        Wikipedia ID number, article page text)
    # adapted from:
    # http://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html
    # https://github.com/jeffheaton/article-code/blob/master/python/wikipedia/wiki-basic-stream.py
    # Simple example of streaming a Wikipedia
    # Copyright 2017 by Jeff Heaton, released under the The GNU Lesser General Public License (LGPL).
    # http://www.heatonresearch.com
    # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    '''
    if event == 'start':
        if tname == 'page':
            title = ''
            wiki_id = -1
            redirect = ''
            inrevision = False
            ns = 0
        elif tname == 'revision':
            inrevision = True
    else:
        if tname == 'title':
            if save_pages:
                title = elem.text               # page title
            else:
                pass
        elif tname == 'id' and not inrevision:  # excludes IDs for pages being revised
            if save_pages:
                wiki_id = int(elem.text)        # Wikipedia page ID number
            else:
                pass
        elif tname == 'redirect':
            redirect = elem.attrib['title']
        elif tname == 'ns':
            ns = int(elem.text)
        elif tname == 'text':
            if save_pages:
                page_text = elem.text           # content from articles page
            else:
                pass

    return(ns, redirect, wiki_id, title, page_text)


def count_index_articles(ns, redirect, print_status_interval, num_documents,
                         sampling_interval, template_count, redirect_count,
                         article_count, sampled_article_count):
    '''
    Receives information from parsing through Wikipedia XML tree and counts
        the types of pages
    The 3 types of Wikipedia pages are: template, redirect, article
    'sampled_article_count' - if only a portion of the article pages are being
        sampled (i.e., 'sampling_interval' > 1), this number will be lower
        than 'article_count'
    # adapted from:
    # http://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html
    # https://github.com/jeffheaton/article-code/blob/master/python/wikipedia/wiki-basic-stream.py
    # Simple example of streaming a Wikipedia
    # Copyright 2017 by Jeff Heaton, released under the The GNU Lesser General Public License (LGPL).
    # http://www.heatonresearch.com
    # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    '''
    article_end = False

    if ns == 10:
        template_count += 1
    elif len(redirect) > 0:
        redirect_count += 1
    else:
        article_count += 1
        print_intermittent_status_message_in_loop(
            article_count, print_status_interval, num_documents)
        if article_count % sampling_interval == 0:
            sampled_article_count += 1
            article_end = True

    return(template_count, redirect_count, article_count,
           sampled_article_count, article_end)


def modify_text(a_string):
    '''
    Processes Wikipedia text for analysis: removes HTML tags, removes newline
        indicators, converts to lowercase, removes references, removes URLs,
        tokenizes, removes punctuations, removes stop words, removes numbers,
        stems
    Wikipedia text is input as a single string; each string is an article
    Returns a list of processed tokens
    Modifications have been made to the function to allow faster processing;
        original statements are retained as comments for portability and
        convenience when using the function in other situations
    '''
    #import html2text
    #import nltk
    # moved imports from 'html2text' and 'nltk' to global variables for speed
    #from html2text import html2text
    #from nltk.tokenize import ToktokTokenizer
    #from nltk.corpus import stopwords
    #from nltk.stem.lancaster import LancasterStemmer
    html_2_text = html2text
    nltk_tok_tok = ToktokTokenizer
    nltk_stopwords = stopwords
    nltk_lancaster = LancasterStemmer
    import string
    import re

    #nltk.download('punkt')
    #nltk.download('all')

    a_string = a_string.split('=References=')[0]    # remove references and everything afterwards
    #a_string = html2text.html2text(a_string).lower()   # remove HTML tags, convert to lowercase
    a_string = html_2_text(a_string).lower()            # remove HTML tags, convert to lowercase
    a_string = re.sub(r'https?:\/\/.*?[\s]', '', a_string)   # remove URLs

    # 'word_tokenize' doesn't divide by '|' and '\n'
    # 'ToktokTokenizer' does divide by '|' and '\n', but retaining this
    # statement seems to improve its speed a little
    a_string = a_string.replace('|', ' ').replace('\n', ' ')

    #tokens = nltk.tokenize.word_tokenize(a_string)
    #tokenizer = nltk.tokenize.ToktokTokenizer()    # tokenizes faster than 'word_tokenize'
    tokenizer = nltk_tok_tok()                      # tokenizes faster than 'word_tokenize'
    tokens = tokenizer.tokenize(a_string)

    #stop_words = nltk.corpus.stopwords.words('english')
    stop_words = nltk_stopwords.words('english')
    string_punctuation = list(string.punctuation)
    remove_items_list = stop_words + string_punctuation
    tokens = [w for w in tokens if w not in remove_items_list]
    tokens = [w for w in tokens if '=' not in w]    # remove remaining tags and the like
    tokens = [w for w in tokens if not              # remove tokens that are all digits or punctuation
              all(x.isdigit() or x in string_punctuation for x in w)]
    tokens = [w.strip(string.punctuation) for w in tokens]   # remove stray punctuation attached to words
    tokens = [w for w in tokens if len(w) > 1]      # remove single characters
    tokens = [w for w in tokens if not any(x.isdigit() for x in w)]   # remove everything with a digit in it

    #stemmer = nltk.stem.PorterStemmer()
    #stemmer = nltk.stem.SnowballStemmer('english')
    #stemmer = nltk.stem.lancaster.LancasterStemmer()   # fastest stemmer; results seem okay
    stemmer = nltk_lancaster()                          # fastest stemmer; results seem okay
    stemmed = [stemmer.stem(w) for w in tokens]

    return(stemmed)


def create_sqlite_database(database_name, template_table_name,
                           redirect_table_name, articles_table_name,
                           articles_text_col_name, key_col_name,
                           processing_id):
    '''
    Creates sqlite database in which to store
    processed Wikipedia XML file
    Three tables are created in the database, one each for Wikipedia template
        pages, redirect pages, and article pages
    'database_name' - name of the database to create
    'template_table_name' - template pages table
    'redirect_table_name' - redirect pages table
    'articles_table_name' - articles pages table
    'articles_text_col_name' - column in articles table in which text of each
        article is stored
    'key_col_name' - column in articles table for primary key
    'processing_id' - column in articles table for index obtained when
        extracting the article from the Wikipedia XML dump file; used only for
        creating database and not a piece of information that is contained in
        Wikipedia itself
    '''
    import sqlite3
    con = sqlite3.connect(database_name)
    cur = con.cursor()
    cur.execute('CREATE TABLE {t} (wiki_id INTEGER, title TEXT)'
                .format(t=template_table_name))
    cur.execute('CREATE TABLE {t} (wiki_id INTEGER, title TEXT, redirect TEXT)'
                .format(t=redirect_table_name))
    cur.execute('CREATE TABLE {t} ({k} INTEGER PRIMARY KEY, wiki_id INTEGER, '
                'title TEXT, {te} TEXT, {pid} INTEGER)'
                .format(t=articles_table_name, k=key_col_name,
                        te=articles_text_col_name, pid=processing_id))
    con.commit()
    con.close()


def get_db_col_values(database_name, table_name, col_name, as_list=False):
    '''
    Retrieves values of a column from a SQLite database table
    If 'as_list' = True, returns values as items in a list
    If 'as_list' = False, returns values as a list of tuples where the values
        are the first items in each tuple
    '''
    import sqlite3
    con = sqlite3.connect(database_name)
    cur = con.cursor()
    cur.execute('SELECT {pid} FROM {at}'.format(pid=col_name, at=table_name))
    values = cur.fetchall()
    if as_list:
        values = [e[0] for e in values]
    con.commit()
    con.close()
    return(values)


def insert_row_sqlite(database_name, table_name, values_list):
    '''
    Inserts row into table of SQLite database
    'database_name' - name of SQLite database
    'table_name' - name of table to insert row into
    'values_list' - list of the row's values to insert
    WARNING: do not use this function with an unsecured database; it is
        vulnerable to SQL injection attacks
    '''
    import sqlite3
    con = sqlite3.connect(database_name)
    cur = con.cursor()
    placeholders = ', '.join('?' * len(values_list))
    cur.execute('INSERT INTO {t} VALUES ({p})'
                .format(t=table_name, p=placeholders), (values_list))
    con.commit()
    con.close()


def insert_rows_sqlite(database_name, table_name, values_list):
    '''
    Inserts multiple rows into table of SQLite database
    'database_name' - name of SQLite database
    'table_name' - name of table to insert rows into
    'values_list' - list of rows' values to insert
    WARNING: do not use this function with an unsecured database; it is
        vulnerable to SQL injection attacks
    '''
    import sqlite3
    con = sqlite3.connect(database_name)
    cur = con.cursor()
    placeholders = ', '.join('?' * len(values_list[0]))
    #placeholders = [', '.join('?' * len(e)) for e in values_list]
    cur.executemany('INSERT INTO {t} VALUES ({p})'
                    .format(t=table_name, p=placeholders), (values_list))
    con.commit()
    con.close()


def wiki_id_check(article_count, wiki_id):
    '''
    Checks that 'wiki_id' is a valid integer; if not, returns error message
    '''
    error_entry = None
    if not wiki_id:
        wiki_id = -1
        error_message = 'missing wiki_id'
        error_entry = [error_message, article_count, wiki_id]
    elif not isinstance(wiki_id, int):
        try:
            wiki_id = int(wiki_id)
            error_message = 'wiki_id not an integer, conversion successful'
            error_entry = [error_message, article_count, wiki_id]
        except:
            error_message = 'wiki_id not an integer, conversion failed'
            error_entry = [error_message, article_count, wiki_id]
    return(wiki_id, error_entry)


def title_check(article_count, wiki_id, title):
    '''
    Checks that 'title' is a valid string; if not, returns error message
    '''
    error_entry = None
    if not title:
        title = ''
        error_message = 'missing article title'
        error_entry = [error_message, article_count, wiki_id]
    elif not isinstance(title, str):
        try:
            title = str(title)
            error_message = 'title not a string, conversion successful'
            error_entry = [error_message, article_count, wiki_id, title]
        except:
            error_message = 'title not a string, conversion failed'
            error_entry = [error_message, article_count, wiki_id, title]
    return(title, error_entry)


def save_wiki_to_sql(ns, redirect, print_status_interval, num_documents, rows,
                     key_list, database_name, template_table_name,
                     redirect_table_name, articles_table_name, template_count,
                     redirect_count, article_count, sampled_article_count,
                     wiki_id, title, page_text, iter_count):
    '''
    # adapted from:
    # http://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html
    # https://github.com/jeffheaton/article-code/blob/master/python/wikipedia/wiki-basic-stream.py
    # Simple example of streaming a Wikipedia
    # Copyright 2017 by Jeff Heaton, released under the The GNU Lesser General Public License (LGPL).
    # http://www.heatonresearch.com
    # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    '''
    # number of rows to insert into database at a time
    n_rows_to_save = 1000

    error_log_filename = 'error_log_stream.txt'

    if ns == 10:
        template_count += 1
        # commented out to save time
        #insert_row_sqlite(database_name, template_table_name,
        #                  [wiki_id, title])
    elif len(redirect) > 0:
        redirect_count += 1
        # commented out to save time
        #insert_row_sqlite(database_name, redirect_table_name,
        #                  [wiki_id, title, redirect])
    else:
        article_count += 1
        print_intermittent_status_message_in_loop(
            sampled_article_count, print_status_interval, num_documents)
        sampled_article_count += 1

        if page_text:
            try:
                page_text = modify_text(page_text)
                page_text = ' '.join(page_text)
            except:
                error_entry = ['error in modify_text function', iter_count,
                               article_count, wiki_id]
                write_list_to_text_file([str(error_entry)], error_log_filename)
                page_text = ''
        else:
            error_entry = ['missing article text', iter_count, article_count,
                           wiki_id]
            write_list_to_text_file([str(error_entry)], error_log_filename)
            page_text = ''

        #wiki_id, error_entry = wiki_id_check(article_count, wiki_id)
        #if error_entry:
        #    write_list_to_text_file([str(error_entry)], error_log_filename)
        #title, error_entry = title_check(article_count, wiki_id, title)
        #if error_entry:
        #    write_list_to_text_file([str(error_entry)], error_log_filename)

        row_values = (key_list[sampled_article_count-1], wiki_id, title,
                      page_text, iter_count)
        rows.append(row_values)

        if len(rows) >= n_rows_to_save or sampled_article_count == num_documents:
            try:
                insert_rows_sqlite(database_name, articles_table_name, rows)
                rows = []
            except:
                error_items = [e[4] for e in rows]
                error_entry = ['database insert failed', error_items]
                write_list_to_text_file([str(error_entry)], error_log_filename)
                rows = []

        #values_list = [key_list[sampled_article_count-1],
        #               wiki_id, title, page_text]
        #insert_row_sqlite(database_name, articles_table_name, values_list)

    return(template_count, redirect_count, article_count,
           sampled_article_count, rows)


def process_wiki(wiki_path, print_status_interval, num_documents, article_idx,
                 database_names, key_list, sampling_interval=1,
                 save_pages=False):
    '''
    Processes a dumped Wikipedia XML file and stores the results in a SQLite
        database
    'wiki_path' - file path to the Wikipedia XML file
    'database_names' - names for database to create, template table, redirect
        table, articles table, articles table's text column, and article
        table's primary key column
    'key_list' - list of integers to become the primary key for the table of
        articles in the database
    'print_status_interval' - the number of articles to process before
        providing a status update message (e.g., print the message every 50
        articles)
    'num_documents' - the number of pages with articles in the Wikipedia XML
        file
    'sampling_interval' - process only every Xth article page, where X is the
        sampling interval

    Wikipedia periodically provides its entire website in a 'dump': further
        information here:
        https://meta.wikimedia.org/wiki/Data_dumps
        https://dumps.wikimedia.org/enwiki/

    This function divides the Wikipedia pages into template pages, redirect
        pages, and pages with articles; information on each is stored in its
        own table in the database
    Information on a template page includes its Wikipedia ID number and title
    Information on a redirect page includes its Wikipedia ID number, title,
        and the title of the page that the user is redirected to
    Information on an article page includes its Wikipedia ID number, title,
        and the text of the page
    The text of an article page is processed for analysis by a call to the
        function 'modify_text' before being stored in the database

    The function returns the name of the articles table, the name of the
        column with the processed article text, and the name of the column
        with the primary key

    # adapted from:
    # http://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html
    # https://github.com/jeffheaton/article-code/blob/master/python/wikipedia/wiki-basic-stream.py
    # Simple example of streaming a Wikipedia
    # Copyright 2017 by Jeff Heaton, released under the The GNU Lesser General Public License (LGPL).
    # http://www.heatonresearch.com
    # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    '''
    import xml.etree.ElementTree as etree
    import time

    if save_pages:
        database_name = database_names[0]
        template_table_name = database_names[1]
        redirect_table_name = database_names[2]
        articles_table_name = database_names[3]

    iter_count = 0
    page_count = 0
    article_count = 0
    redirect_count = 0
    template_count = 0
    sampled_article_count = 0

    title = ''
    wiki_id = -1
    redirect = ''
    inrevision = False
    ns = 0
    page_text = ''

    if save_pages:
        num_documents = len(article_idx)
        sampled_article_count = 0
    rows = []

    start_time = time.time()

    for event, elem in etree.iterparse(wiki_path, events=('start', 'end')):

        tname = strip_tag_name(elem)

        ns, redirect, wiki_id, title, page_text = process_wiki_xml(
            event, elem, tname, title, wiki_id, redirect, inrevision, ns,
            page_text, iter_count, save_pages)

        if tname == 'page' and event != 'start':
            page_count += 1

            # error inspection / debugging
            #if article_count > 21999 and article_count < 22003:
            #    write_list_to_text_file([page_text], 'wiki_article' + str(article_count) + '.txt', 'w')

            # instead of passing so many variables to and from the same
            # functions, probably better to create a class object so that it
            # can be passed and its state persists across loop iterations
            if not save_pages:
                (template_count, redirect_count, article_count,
                 sampled_article_count, article_end) = count_index_articles(
                    ns, redirect, print_status_interval, num_documents,
                    sampling_interval, template_count, redirect_count,
                    article_count, sampled_article_count)
                if article_end:
                    article_idx.append(iter_count)

            elif iter_count in article_idx:
            #elif iter_count in article_idx and article_count > 60000:
                (template_count, redirect_count, article_count,
                 sampled_article_count, rows) = save_wiki_to_sql(
                    ns, redirect, print_status_interval, num_documents, rows,
                    key_list, database_name, template_table_name,
                    redirect_table_name, articles_table_name, template_count,
                    redirect_count, article_count, sampled_article_count,
                    wiki_id, title, page_text, iter_count)

            # save a few unmodified articles to text files for testing
            #if article_count < 12:
            #    write_list_to_text_file([page_text], 'wiki_article' + str(article_count) + '.txt', 'w')

            elem.clear()

        iter_count += 1

    elapsed_time = time.time() - start_time

    print('Number of template pages: {}'.format(template_count))
    print('Number of redirect pages: {}'.format(redirect_count))
    print('Number of article pages: {}'.format(article_count))
    print('Number of total pages: {}'.format(page_count))
    print('Elapsed time: {}'.format(hms_string(elapsed_time)))

    if not save_pages:
        return(article_idx)
    else:
        return()


def split_list(a_list, n_parts):
    '''
    Splits a list into (nearly) equal-sized parts and returns them as a list
        of lists
    taken from:
    https://stackoverflow.com/questions/2130016/splitting-a-list-of-into-n-parts-of-approximately-equal-length
    '''
    d, r = divmod(len(a_list), n_parts)
    return(list(a_list[i * d + min(i, r):(i + 1) * d + min(i + 1, r)]
                for i in range(n_parts)))
def pool_process_wiki(wiki_path, message_interval, article_idx, database_names,
                      key_list, sampling_interval, save_pages, n_jobs):
    '''
    Parallelizes processing of dumped Wikipedia XML file and storing of the
        results in a SQLite database
    'wiki_path' - file path to the Wikipedia XML file
    'message_interval' - the number of articles to process before providing a
        status update message (e.g., print the message every 50 articles)
    'article_idx' - positional indices of articles in the XML file to be
        processed
    'database_names' - names for database to create, template table, redirect
        table, articles table, articles table's text column, and article
        table's primary key column
    'key_list' - list of integers to become the primary key for the table of
        articles in the database
    'sampling_interval' - process only every Xth article page, where X is the
        sampling interval
    'save_pages' - if 'True', saves pages to database; if 'False', only counts
        the pages
    'n_jobs' - number of parallel jobs/processors to use
    '''
    from itertools import repeat
    from multiprocessing import Pool

    article_idx_lists = split_list(article_idx, n_jobs)
    key_lists = split_list(key_list, n_jobs)

    params = zip(repeat(wiki_path),
                 repeat(message_interval),
                 repeat(0),                 # dummy value for 'num_documents'
                 article_idx_lists,
                 repeat(database_names),
                 key_lists,
                 repeat(sampling_interval),
                 repeat(save_pages))

    with Pool(processes=n_jobs) as pool:
        pool.starmap(process_wiki, params)


def iter_documents_sqlite(database_name, table_name, col_name, key_col_name):
    '''
    Iterates through database and returns each document (a list of tokens)
    '''
    import sqlite3
    con = sqlite3.connect(database_name)
    cur = con.cursor()

    cur.execute('SELECT COUNT(*) FROM {t}'.format(c=key_col_name, t=table_name))
    response = cur.fetchall()
    num_documents = response[0][0]
    print('Number of documents in database is ', num_documents)

    try:
        for i in range(num_documents):
            cur.execute('SELECT ({c1}) FROM {t} WHERE {c2}=?'
                        .format(c1=col_name, t=table_name, c2=key_col_name),
                        (i, ))
            response = cur.fetchall()
            if response:
                document = response[0][0]
            else:
                document = ''
            yield(document.split())
    finally:
        con.close()


class TheCorpusFromSql(object):
    '''
    Iterates through each document (a list of tokens) and creates a corpus
        and dictionary in accordance with the Gensim text analysis package
    '''
    def __init__(self, database_name, table_name, col_name, key_col_name):
        from gensim.corpora import Dictionary
        self.database_name = database_name
        self.table_name = table_name
        self.col_name = col_name
        self.key_col_name = key_col_name
        self.dictionary = Dictionary(iter_documents_sqlite(
            database_name, table_name, col_name, key_col_name))

    def __iter__(self):
        for document_tokens_list in iter_documents_sqlite(
                self.database_name, self.table_name,
                self.col_name, self.key_col_name):
            yield self.dictionary.doc2bow(document_tokens_list)


def main():
    '''
    Creates and saves corpus and dictionary from Wikipedia XML file dump

    Wikipedia periodically provides its entire website in a 'dump': further
        information here:
        https://meta.wikimedia.org/wiki/Data_dumps
        https://dumps.wikimedia.org/enwiki/

    Wikipedia XML dump download information:

    Main site version:
    2017-07-02 19:52:45 done Recombine articles, templates, media/file
        descriptions, and primary meta-pages.
        enwiki-20170701-pages-articles.xml.bz2 13.1 GB
    2017-07-02 16:48:31 done Articles, templates, media/file descriptions,
        and primary meta-pages.
        enwiki-20170701-pages-articles1.xml-p10p30302.bz2 156.8 MB

    Illinois mirror:
    ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20170701/

    Illinois mirror version:
    File:enwiki-20170701-pages-articles.xml.bz2 13784309 KB 7/2/17 7:52:00 PM
    File:enwiki-20170701-pages-articles1.xml-p10p30302.bz2 160604 KB 7/2/17 3:46:00 PM

    Downloads, July 13, 2017:
    Entire Wikipedia dump file:
    ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20170701/enwiki-20170701-pages-articles.xml.bz2
    Part of Wikipedia dump file, to use for smaller-scale testing:
    ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20170701/enwiki-20170701-pages-articles1.xml-p10p30302.bz2
    '''
    import os
    import random
    import gensim
    import time

    start_time = time.time()

    # prepare, decompress Wikipedia XML dump file
    # -------------------------------------------------
    filepath = os.getcwd()
    #filename = 'enwiki-20170701-pages-articles1.xml-p10p30302.bz2'  # small part of Wikipedia
    filename = 'enwiki-20170701-pages-articles.xml.bz2'              # all of Wikipedia
    compressed_path = os.path.join(filepath, filename)
    wiki_path = compressed_path.rsplit('.', 1)[0]
    #decompress_bz2_file(compressed_path, wiki_path)

    # count and index Wikipedia articles
    # -------------------------------------------------
    message_interval = 100000   # print status update message every 'x' documents
    #est_num_docs = 15151       # estimated number of documents, small part of Wikipedia
    est_num_docs = 9019957      # estimated number of documents, all of Wikipedia
    sampling_interval = 3

    article_idx_filename = 'article_idx.txt'
    article_idx_path = os.path.join(filepath, article_idx_filename)

    # if articles have already been counted and indexed, retrieve list
    if os.path.exists(article_idx_path):
        article_idx_str = read_text_file(article_idx_path)
        num_documents_all = len(article_idx_str)
        article_idx_str = article_idx_str[0::sampling_interval]
        article_idx = [int(e) for e in article_idx_str]
    # otherwise, count and index Wikipedia articles in Wikipedia XML dump file
    else:
        article_idx = []
        article_idx = process_wiki(wiki_path, message_interval, est_num_docs,
                                   article_idx, '', [], sampling_interval)
        article_idx_str = [str(e) for e in article_idx]
        write_list_to_text_file(article_idx_str, article_idx_path, 'w')

    print('Wikipedia page indices loaded.')

    # set up SQLite database to store processed Wikipedia articles
    # -------------------------------------------------
    # database names: names for database, template table, redirect table,
    # articles table, articles table's text column, article table's
    # primary key column, and processing ID for each article
    database_names = ['wiki_token_docs.sqlite', 'template', 'redirect',
                      'articles', 'text', 'key', 'processing_id']
    database_path = os.path.join(filepath, database_names[0])

    # if SQLite database doesn't already exist, create it
    if not os.path.exists(database_path):
        print('Creating database.')
        create_sqlite_database(database_names[0], database_names[1],
                               database_names[2], database_names[3],
                               database_names[4], database_names[5],
                               database_names[6])
        article_idx_done = []
    # or if SQLite database does already exist, retrieve indices of articles
    # that have already been saved to it
    else:
        article_idx_done = get_db_col_values(
            database_names[0], database_names[3], database_names[6], True)

    article_idx_do = list(set(article_idx) - set(article_idx_done))

    print('{} articles are already saved to the database.'
          .format(len(article_idx_done)))
    print('{} processed articles will be saved to the database.'
          .format(len(article_idx_do)))
    if len(article_idx_do) == 0:
        print('All articles have been added to the database.')
        return

    # process, save Wikipedia to SQLite database
    # -------------------------------------------------
    # set up article indices and database table keys, excluding any already in
    # the database
    num_documents_do = len(article_idx_do)
    key_list_all = range(num_documents_all)
    key_list_used = get_db_col_values(
        database_names[0], database_names[3], database_names[5], True)
    key_list_available = set(key_list_all) - set(key_list_used)

    # randomizes order of Wikipedia articles when accessing by table primary key
    random.seed(513598)
    key_list = random.sample(key_list_available, len(key_list_available))

    save_pages = True
    n_jobs = 4
    # print status update message every 'x' documents
    message_interval = max(((num_documents_do / n_jobs) // 1000), 1)

    if n_jobs == 1:
        process_wiki(wiki_path, message_interval, num_documents_do,
                     article_idx_do, database_names, key_list,
                     sampling_interval, save_pages=True)
    else:
        pool_process_wiki(wiki_path, message_interval, article_idx_do,
                          database_names, key_list, sampling_interval,
                          save_pages, n_jobs)

    # delete de-compressed Wikipedia dump file
    # -------------------------------------------------
    #os.remove(wiki_path)

    # create and save Gensim corpus and dictionary
    # -------------------------------------------------
    wiki_corpus = TheCorpusFromSql(database_names[0], database_names[3],
                                   database_names[4], database_names[5])
    #wiki_dictionary = wiki_corpus.dictionary
    wiki_corpus.dictionary.save('wiki_dictionary.dict')
    wiki_corpus.dictionary.save_as_text('wiki_dictionary.txt')
    gensim.corpora.MmCorpus.serialize('wiki_corpus.mm', wiki_corpus)

    # -------------------------------------------------
    elapsed_time = time.time() - start_time
    print('Finished at {ct}'.format(ct=time.ctime(int(time.time()))))
    print('Elapsed time: {}'.format(hms_string(elapsed_time)))


if __name__ == '__main__':
    main()
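For context on how the saved output is meant to be consumed afterward, here is a minimal sketch (not part of the script above) of loading the serialized dictionary and corpus back into Gensim; it assumes 'main' has already written 'wiki_dictionary.dict' and 'wiki_corpus.mm' to the working directory:

#! /usr/bin/env python3
# minimal sketch: load the dictionary and corpus saved by 'main'
# assumes 'wiki_dictionary.dict' and 'wiki_corpus.mm' exist in the working directory
from gensim.corpora import Dictionary, MmCorpus

wiki_dictionary = Dictionary.load('wiki_dictionary.dict')
wiki_corpus = MmCorpus('wiki_corpus.mm')    # streams documents from disk

print('Number of unique tokens: {}'.format(len(wiki_dictionary)))
print('Number of documents: {}'.format(wiki_corpus.num_docs))

# each document is a bag-of-words list of (token id, count) pairs
for bow in wiki_corpus:
    print(bow[:10])
    break

Because MmCorpus streams from disk and TheCorpusFromSql streams from the SQLite database, neither step requires holding the full corpus in memory.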