Index Text Files with Python for Rapid Information Retrieval

Introduction 

The technique known as "indexing" plays a fundamental role in search engines like Google and Yahoo, and can help researchers expedite their data analysis. This recipe describes the steps one can follow to index data with the Python package Whoosh.

Ingredients 
  • Python 2.7
  • Pip
  • Unix Shell or Mingw on Windows
Exercise Steps 

1. Install Whoosh, the NLTK, and Beautiful Soup, the packages we’ll use to index the text files. To install these packages, try opening up a Unix shell or Mingw terminal and running:

pip install whoosh
pip install nltk
pip install beautifulsoup4
import codecs, urllib2, os, time

from contextlib import closing

# Specify the name of the directory in which we'll store the sample text collection
if not os.path.exists("sample_text_collection"):
    os.makedirs("sample_text_collection")

# Create a list of urls from which we'll extract full texts
urls_to_ping = [
    'http://www.gutenberg.org/files/829/829-0.txt',
    'http://www.gutenberg.org/files/521/521-0.txt',
    'http://www.gutenberg.org/cache/epub/2160/pg2160.txt',
]


def extract_header_field(text, label):
    """Return the remainder of the line that follows the first *label* in *text*.

    The value is returned exactly as it appears (leading whitespace included).
    An empty unicode string is returned when *label* does not occur, so that a
    text without the expected header no longer aborts the whole download.
    """
    pieces = text.split(label)
    if len(pieces) < 2:
        return u""
    return pieces[1].replace("\r", "").split("\n")[0]


# Loop over those urls, collecting the text from each, and writing limited
# metadata fields (filename, author, title; tab-separated) to disk
with codecs.open("sample_text_collection_metadata.txt", "w", "utf-8") as metadata_out:
    for url in urls_to_ping:
        # closing() releases the connection as soon as the body has been
        # read, even if read() or decode() raises.
        with closing(urllib2.urlopen(url)) as response:
            html = response.read().decode('utf-8')

        # Extract metadata features from each file
        author_name = extract_header_field(html, "Author:")
        text_title = extract_header_field(html, "Title:")
        filename = url.split("/")[-1]
        metadata_out.write(filename + "\t" + author_name + "\t" + text_title + "\n")

        # Save the full text under the same filename used in the metadata row
        with codecs.open("sample_text_collection/" + filename, "w", "utf-8") as file_out:
            file_out.write(html)

        # Pause the script for 2 seconds to throttle the requests to Project Gutenberg
        time.sleep(2)

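2. Collect texts to be indexed. The script above (beginning with "import codecs, urllib2, os, time") performs this step: it downloads three sample texts from Project Gutenberg into the sample_text_collection folder and records each file's name, author, and title in sample_text_collection_metadata.txt.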

3. Define index schema:

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StandardAnalyzer
from string import maketrans, punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import codecs, sys, glob, os, unicodedata

# Define stopwords using NLTK standard stopword list for English language
stopset = set(stopwords.words('english'))

# Build the translation table used to strip punctuation: every code point
# below sys.maxunicode whose Unicode category begins with "P" is mapped to
# None.
tbl = {
    codepoint: None
    for codepoint in xrange(sys.maxunicode)
    if unicodedata.category(unichr(codepoint))[0] == 'P'
}

# Create function with which to strip punct from unicode
def remove_punctuation(unicode_text):
    """Return unicode_text translated through the module-level ``tbl`` table.

    ``tbl`` maps punctuation code points to None, so the result is the input
    with those characters removed.
    """
    stripped_text = unicode_text.translate(tbl)
    return stripped_text

# Make the index folder
if not os.path.exists("index_for_sample_files"):
    os.mkdir("index_for_sample_files")

# Specify a list of paths that contain all of the texts we wish to index
text_dirs = ["sample_text_collection"]

# Create metadata dict: filename -> {"author": ..., "short_title": ...},
# read from the tab-separated metadata file (filename, author, title per row)
metadata_dict = {}
with codecs.open("sample_text_collection_metadata.txt", "r", "utf-8") as metadata_in:
    metadata_rows = metadata_in.read().split("\n")
    for row in metadata_rows:
        # Skip blank rows (including the empty string left after a trailing
        # newline) instead of blindly discarding the final row, so the last
        # record survives even when the file does not end with a newline.
        if not row.strip():
            continue

        split_row = row.split("\t")
        filename = split_row[0].strip()
        author = split_row[1].strip()
        short_title = split_row[2].strip()

        # Keep the first row seen for a given filename; later duplicates
        # are ignored.
        if filename not in metadata_dict:
            metadata_dict[filename] = {
                "author": author,
                "short_title": short_title,
            }
			
# Create function that will allow one to retrieve metadata fields for a given filename
def retrieve_metadata(filename):
    """Return the metadata record stored for *filename*, or None.

    The record is the dict held in the module-level ``metadata_dict`` (keys
    "author" and "short_title"). When no record exists a warning is printed
    and None is returned; previously the warning sat after the ``return``
    statement and could never run.
    """
    try:
        return metadata_dict[filename]
    except KeyError:
        print("no metadata for: %s" % filename)
        return None
        
# Identify the schema we'll use when creating index. Every field is a stored
# TEXT field; full_text additionally enables phrase support and uses its own
# analyzer instance.
# NOTE(review): stoplist=None presumably disables stop-word removal so that
# every word stays searchable -- confirm against the Whoosh analysis docs.
full_text_analyzer = StandardAnalyzer(stoplist=None)
schema = Schema(
    filename=TEXT(stored=True),
    path=TEXT(stored=True),
    author=TEXT(stored=True),
    short_title=TEXT(stored=True),
    full_text=TEXT(stored=True, phrase=True, analyzer=full_text_analyzer),
)


4. Create index:

# Create the index using the schema defined above
ix = create_in("index_for_sample_files", schema)

writer = ix.writer()

for i in text_dirs:
    for j in glob.glob(i + "/*.txt"):       
        with codecs.open(j,"r","utf-8") as raw_file:
			
			cleaner_file = remove_punctuation( raw_file.read()
                           .replace("\r","").replace("\n"," ") )
					
            # Grab filename, then use that to grab all metadata. NB:
            # Unix users should change the following line to j.split("/")[-1]
			filename        = j.split("\\")[-1]
			print "indexing file: ",filename
			
			path            = j.decode("utf-8")
			file_metadata   = retrieve_metadata(filename)
			author          = file_metadata["author"].decode("utf-8")
			short_title     = file_metadata["short_title"].decode("utf-8")
			
			# Now push full text and metadata fields to the index
			writer.add_document(filename = unicode(filename), path=path,
            author=author, short_title=short_title, full_text=cleaner_file )
            
# Commit changes to index
writer.commit()


5. Prepare to search the index:

from whoosh import highlight
from whoosh.index import open_dir
from whoosh.query import Phrase, Term, spans
from whoosh.highlight import SentenceFragmenter
from nltk import ngrams
from bs4 import BeautifulSoup
import codecs, collections, subprocess, urllib2

class CustomScorer(highlight.FragmentScorer):
    """Fragment scorer that rewards fragments containing the exact phrase.

    A fragment scores 100 when the phrase's words occur at consecutive
    positions, in order, among the fragment's matched tokens; otherwise 0.
    """

    def __init__(self, phrase):
        # Remember the sequence of words carried by the phrase query
        self.words = phrase.words

    def __call__(self, f):
        # Map each matched word to the list of positions at which it occurs,
        # e.g. "foo" -> [1, 5, 10]
        positions_by_word = collections.defaultdict(list)
        for match in f.matches:
            positions_by_word[match.text].append(match.pos)

        leading_word = self.words[0]
        trailing_words = self.words[1:]

        # Try every occurrence of the leading word as a candidate start and
        # require the n-th following word to sit exactly n positions later.
        for start in positions_by_word[leading_word]:
            if all((start + offset) in positions_by_word[word]
                   for offset, word in enumerate(trailing_words, 1)):
                return 100
        return 0

# The index can be queried in many ways. Here we fetch the novel Tom Jones
# and save it locally, so that its trigrams can later be looked up in the
# indexed files.
tom_jones_url = "http://www.gutenberg.org/cache/epub/6593/pg6593.txt"
response = urllib2.urlopen(tom_jones_url)
raw_bytes = response.read()
html = raw_bytes.decode('utf-8')

# Write the decoded text to disk as UTF-8
with codecs.open("tom_jones.txt", "w", "utf-8") as tom_jones_out:
    tom_jones_out.write(html)


6. Search the index:

with codecs.open("tom_jones.txt", "r", "utf-8") as tom_jones_in:
    # Flatten line breaks, keep only the text that follows the first
    # "PROJECT GUTENBERG" marker (up to the next one, if any), and turn its
    # whitespace-separated words into trigrams.
    tom_jones_text = tom_jones_in.read().replace("\r", "").replace("\n", " ")
    tom_jones_body = tom_jones_text.split("PROJECT GUTENBERG")[1]
    tom_jones_trigrams = ngrams(tom_jones_body.split(), 3)

with codecs.open("matching_searches.txt", "w", "utf-8") as matching_searches_out:
    ix = open_dir("index_for_sample_files")
    with ix.searcher() as searcher:
        for trigram in tom_jones_trigrams:

            # NOTE(review): the trigram words are passed to Phrase exactly as
            # they appear in the text (original case and punctuation), while
            # the index was built from analyzed, punctuation-stripped text --
            # confirm whether the words should be normalized first.
            phrase_query = Phrase("full_text", trigram)
            results = searcher.search(phrase_query)
            results.fragmenter.charlimit = None
            results.scorer = CustomScorer(phrase_query)
            searched_phrase = u" ".join(trigram)

            for hit in results:

                # We've identified at least one hit in our index. Whoosh contains a built-in
                # set of tools we can use to "highlight" those hits, but we can also grep the
                # files with hits to extract the matching string in context
                file_with_hit = hit["path"]
                author_of_hit_file = hit["author"]
                title_of_hit_file = hit["short_title"]

                with codecs.open(file_with_hit, "r", "utf-8") as fileobj:
                    filecontents = fileobj.read()

                hit_highlights = hit.highlights("full_text", text=filecontents, top=100000)

                # A single hit highlights object can contain multiple hits separated by an
                # ellipsis. Make sure you get them all. The parser is named explicitly so
                # the result does not depend on which optional parsers are installed.
                hit_list = BeautifulSoup(hit_highlights, "html.parser").get_text().split("...")

                # A distinct loop name keeps the Whoosh ``hit`` object from being shadowed
                for fragment in hit_list:
                    fragment_words = fragment.split()
                    if not fragment_words:
                        # Nothing but whitespace between ellipses: no context to report
                        continue
                    clean_hit = "..." + " ".join(fragment_words) + "..."

                    matching_searches_out.write(
                        searched_phrase + "\t" + author_of_hit_file + "\t"
                        + title_of_hit_file + "\t" + file_with_hit + "\t"
                        + clean_hit + "\n")
Further Information