Index Text Files with Python for Rapid Information Retrieval
Introduction
The technique known as "indexing" plays a fundamental role in search engines like Google and Yahoo, and it can help researchers dramatically speed up their data analysis. This recipe describes the steps one can follow to index a collection of text files with the Python package Whoosh.
Ingredients
- Python 2.7
- Pip
- Unix shell or MinGW on Windows
Exercise Steps
1. Install Whoosh, NLTK, and Beautiful Soup, the packages we'll use to index the text files. To install them, open a Unix shell or MinGW terminal and run:
pip install whoosh
pip install nltk
pip install beautifulsoup4
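The NLTK stopword list used in step 3 ships as a separate corpus, so download it once after installing the packages:

python -m nltk.downloader stopwords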
2. Collect texts to be indexed:

import codecs, urllib2, os, time

# Specify the name of the directory in which we'll store the sample text collection
if not os.path.exists("sample_text_collection"):
    os.makedirs("sample_text_collection")

# Create a list of urls from which we'll extract full texts
urls_to_ping = ['http://www.gutenberg.org/files/829/829-0.txt',
                'http://www.gutenberg.org/files/521/521-0.txt',
                'http://www.gutenberg.org/cache/epub/2160/pg2160.txt']

# Loop over those urls, collecting the text from each, and writing limited metadata fields to disk
with codecs.open("sample_text_collection_metadata.txt", "w", "utf-8") as metadata_out:
    for url in urls_to_ping:
        response = urllib2.urlopen(url)
        html = response.read().decode('utf-8')

        # Extract metadata features from each file
        author_name = html.split("Author:")[1].replace("\r", "").split("\n")[0]
        text_title = html.split("Title:")[1].replace("\r", "").split("\n")[0]
        filename = url.split("/")[-1]
        metadata_out.write(filename + "\t" + author_name + "\t" + text_title + "\n")

        with codecs.open("sample_text_collection/" + filename, "w", "utf-8") as file_out:
            file_out.write(html)

        # Pause the script for 2 seconds to throttle the requests to Project Gutenberg
        time.sleep(2)
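Before indexing, it can be worth confirming that the downloads succeeded. A quick, optional check is to list the collection directory and print the metadata file:

import codecs, os

print os.listdir("sample_text_collection")
with codecs.open("sample_text_collection_metadata.txt", "r", "utf-8") as metadata_in:
    print metadata_in.read()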
3. Define index schema:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StandardAnalyzer
from nltk.corpus import stopwords
import codecs, sys, glob, os, unicodedata

# Define stopwords using NLTK's standard stopword list for English
stopset = set(stopwords.words('english'))

# Build a translation table covering all unicode punctuation
tbl = dict.fromkeys(i for i in xrange(sys.maxunicode)
                    if unicodedata.category(unichr(i)).startswith('P'))

# Create a function with which to strip punctuation from unicode text
def remove_punctuation(unicode_text):
    return unicode_text.translate(tbl)

# Make the index folder
if not os.path.exists("index_for_sample_files"):
    os.mkdir("index_for_sample_files")

# Specify a list of paths that contain all of the texts we wish to index
text_dirs = ["sample_text_collection"]

# Create a metadata dict keyed on filename
metadata_dict = {}
with codecs.open("sample_text_collection_metadata.txt", "r", "utf-8") as metadata_in:
    metadata_rows = metadata_in.read().split("\n")
    for row in metadata_rows[:-1]:
        split_row = row.split("\t")
        filename = split_row[0].strip()
        author = split_row[1].strip()
        short_title = split_row[2].strip()
        if filename not in metadata_dict:
            metadata_dict[filename] = {}
        metadata_dict[filename]["author"] = author
        metadata_dict[filename]["short_title"] = short_title

# Create a function that retrieves the metadata fields for a given filename
def retrieve_metadata(filename):
    if filename in metadata_dict:
        return metadata_dict[filename]
    else:
        print "no metadata for: ", filename
        return None

# Identify the schema we'll use when creating the index
schema = Schema(filename=TEXT(stored=True),
                path=TEXT(stored=True),
                author=TEXT(stored=True),
                short_title=TEXT(stored=True),
                full_text=TEXT(stored=True, phrase=True,
                               analyzer=StandardAnalyzer(stoplist=None)))
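Because the full_text field uses StandardAnalyzer(stoplist=None), stopwords are kept in the index, which is what makes exact phrase matching possible in step 6. As an optional aside, you can see how the analyzer tokenizes a sample sentence (the sentence below is arbitrary):

from whoosh.analysis import StandardAnalyzer

analyzer = StandardAnalyzer(stoplist=None)
# Tokens are lowercased, and stopwords such as "the" and "of" are retained
print [token.text for token in analyzer(u"The History of Tom Jones, a Foundling")]
# expected output: [u'the', u'history', u'of', u'tom', u'jones', u'a', u'foundling']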
4. Create index:
# Create the index using the schema defined above
ix = create_in("index_for_sample_files", schema)
writer = ix.writer()

for i in text_dirs:
    for j in glob.glob(i + "/*.txt"):
        with codecs.open(j, "r", "utf-8") as raw_file:
            cleaner_file = remove_punctuation(raw_file.read()
                                              .replace("\r", "").replace("\n", " "))

        # Grab the filename, then use it to look up the file's metadata.
        # os.path.basename handles both Unix and Windows path separators
        filename = os.path.basename(j)
        print "indexing file: ", filename

        path = j.decode("utf-8")
        file_metadata = retrieve_metadata(filename)
        # The metadata values were read with codecs, so they are already unicode
        author = file_metadata["author"]
        short_title = file_metadata["short_title"]

        # Now push the full text and metadata fields to the index
        writer.add_document(filename=unicode(filename), path=path, author=author,
                            short_title=short_title, full_text=cleaner_file)

# Commit changes to the index
writer.commit()
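If you want to verify that the index was written correctly before moving on, a minimal check is to reopen it, count its documents, and run a simple keyword query (the search term "history" is only an example and may return no hits depending on your collection):

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("index_for_sample_files")
print "documents in index: ", ix.doc_count()

with ix.searcher() as searcher:
    query = QueryParser("full_text", ix.schema).parse(u"history")
    for hit in searcher.search(query):
        print hit["author"], "-", hit["short_title"]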
5. Prepare to search the index:
from whoosh import highlight
from whoosh.index import open_dir
from whoosh.query import Phrase
from nltk import ngrams
from bs4 import BeautifulSoup
import codecs, collections, urllib2

class CustomScorer(highlight.FragmentScorer):
    def __init__(self, phrase):
        # Get the list of words from the phrase query
        self.words = phrase.words

    def __call__(self, f):
        # Create a dictionary mapping words to the positions the word
        # occurs at, e.g. "foo" -> [1, 5, 10]
        d = collections.defaultdict(list)
        for token in f.matches:
            d[token.text].append(token.pos)

        # For each position the first word appears at, check to see if the
        # rest of the words appear in order at the subsequent positions
        firstword = self.words[0]
        for pos in d[firstword]:
            found = False
            for word in self.words[1:]:
                pos += 1
                if pos not in d[word]:
                    break
            else:
                found = True
            if found:
                return 100
        return 0

# One can search this index in many ways. Let's read in the novel Tom Jones
# and see which trigrams from that file appear in any of our indexed files.
response = urllib2.urlopen("http://www.gutenberg.org/cache/epub/6593/pg6593.txt")
html = response.read().decode('utf-8')
with codecs.open("tom_jones.txt", "w", "utf-8") as tom_jones_out:
    tom_jones_out.write(html)
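Step 6 below issues one Phrase query per trigram. To see what a single query looks like in isolation, the sketch below runs one hand-picked phrase against the index (the phrase is an arbitrary example and may return no hits on your collection):

from whoosh.index import open_dir
from whoosh.query import Phrase

ix = open_dir("index_for_sample_files")
with ix.searcher() as searcher:
    # Phrase takes a field name plus a list of word tokens, matched in order;
    # the words should be lowercase because the analyzer lowercases indexed tokens
    phrase_query = Phrase("full_text", [u"in", u"the", u"morning"])
    results = searcher.search(phrase_query)
    print "files containing the phrase: ", len(results)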
6. Search the index:
with codecs.open("tom_jones.txt","r","utf-8") as tom_jones_in: tom_jones_trigrams = ngrams(tom_jones_in.read().replace("\r", "") .replace("\n", " ").split("PROJECT GUTENBERG")[1].split("PROJECT GUTENBERG")[0].split(), 3) with codecs.open("matching_searches.txt","w","utf-8") as matching_searches_out: ix = open_dir("index_for_sample_files") with ix.searcher() as searcher: for trigram in tom_jones_trigrams: phrase_query = Phrase("full_text", trigram) results = searcher.search(phrase_query) results.fragmenter.charlimit = None results.scorer = CustomScorer(phrase_query) for hit in results: # We've identified at least one hit in our index. Whoosh contains a built-in # set of tools we can use to "highlight" those hits, but we can also grep the # files with hits to extract the matching string in context file_with_hit = hit["path"] author_of_hit_file = hit["author"] title_of_hit_file = hit["short_title"] with codecs.open( hit["path"], "r", "utf-8") as fileobj: filecontents = fileobj.read() hit_highlights = hit.highlights("full_text", text=filecontents, top=100000) # A single hit highlights object can contain multiple hits separated by an ellipsis. # Make sure you get them all: hit_list = BeautifulSoup(hit_highlights).get_text().split("...") for hit in hit_list: clean_hit = "..." + " ".join(x for x in hit.split()) + "..." matching_searches_out.write( u" ".join(x for x in trigram) + "\t" + author_of_hit_file + "\t" + title_of_hit_file + "\t" + file_with_hit + "\t" + clean_hit + "\n" )
Further Information
- Consult the Whoosh documentation: https://pythonhosted.org/Whoosh/
- Indexing Overview: http://stackoverflow.com/questions/1108/how-does-database-indexing-work
- A comparison of open source search engines: http://wrg.upf.edu/WRG/dctos/Middleton-Baeza.pdf
Submitted by Jinman on Sat, 04/08/2017 - 11:04