Friday, March 2, 2012

Lucene Text Search Engine: How to Index - Part I

Following are the steps to create indexing using Apache Lucene Search API
  • Create IndexWriter
  • For each input
  1. Create a Document
  2. Add Fields to the Document
  3. Add the Document to the IndexWriter
  4. Close the IndexWriter
  5. Optimize (optional)
Sample Example:
package com.techyhouse.lucene;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.*;
import java.util.ArrayList;

public class LuceneTextFileIndexer {

  private IndexWriter writer;
  private ArrayList<File> queue = new ArrayList<File>();

  public static void main(String[] args) throws IOException {
    System.out.println("Enter the path where the index will be created: ");

    BufferedReader br = new BufferedReader(
            new InputStreamReader(System.in));
    String s = br.readLine();

    TextFileIndexer indexer = null;
    try {
      indexer = new TextFileIndexer(s);
    } catch (Exception ex) {
      System.out.println("Cannot create index..." + ex.getMessage());
      System.exit(-1);
    }

    while (!s.equalsIgnoreCase("q")) {
      try {
        System.out.println("Enter the file or folder name to add into the index (q=quit):");
        System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
        s = br.readLine();
        if (s.equalsIgnoreCase("q")) {
          break;
        }

        //try to add file into the index
        indexer.indexFileOrDirectory(s);
      } catch (Exception e) {
        System.out.println("Error indexing " + s + " : " + e.getMessage());
      }
    }

    //===================================================
    //after adding, we always have to call the
    //closeIndex, otherwise the index is not created   
    //===================================================
    indexer.closeIndex();
  }

  /**
   * Constructor
   * @param indexDir the name of the folder in which the index should be created
   * @throws java.io.IOException
   */
  TextFileIndexer(String indexDir) throws IOException {
    // the boolean true parameter means to create a new index everytime,
    // potentially overwriting any existing files there.
    FSDirectory dir = FSDirectory.open(new File(indexDir));
    writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_CURRENT),
        true, IndexWriter.MaxFieldLength.LIMITED);
  }

  /**
   * Indexes a file or directory
   * @param fileName the name of a text file or a folder we wish to add to the index
   * @throws java.io.IOException
   */
  public void indexFileOrDirectory(String fileName) throws IOException {
    //===================================================
    //gets the list of files in a folder (if user has submitted
    //the name of a folder) or gets a single file name (is user
    //has submitted only the file name)
    //===================================================
    listFiles(new File(fileName));
   
    int originalNumDocs = writer.numDocs();
    for (File f : queue) {
      FileReader fr = null;
      try {
        Document doc = new Document();

        //===================================================
        // add contents of file
        //===================================================
        fr = new FileReader(f);
        doc.add(new Field("contents", fr));

        //===================================================
        //adding second field which contains the path of the file
        //===================================================
        doc.add(new Field("path", fileName,
                Field.Store.YES,
                Field.Index.NOT_ANALYZED));

        writer.addDocument(doc);
        System.out.println("Added: " + f);
      } catch (Exception e) {
        System.out.println("Could not add: " + f);
      } finally {
        fr.close();
      }
    }
   
    int newNumDocs = writer.numDocs();
    System.out.println("");
    System.out.println("************************");
    System.out.println((newNumDocs - originalNumDocs) + " documents added.");
    System.out.println("************************");

    queue.clear();
  }

  private void listFiles(File file) {
    if (!file.exists()) {
      System.out.println(file + " does not exist.");
    }
    if (file.isDirectory()) {
      for (File f : file.listFiles()) {
        listFiles(f);
      }
    } else {
      String filename = file.getName().toLowerCase();
      //===================================================
      // Only index text files
      //===================================================
      if (filename.endsWith(".htm") || filename.endsWith(".html") ||
              filename.endsWith(".xml") || filename.endsWith(".txt")) {
        queue.add(file);
      } else {
        System.out.println("Skipped " + filename);
      }
    }
  }

  /**
   * Close the index.
   * @throws java.io.IOException
   */
  public void closeIndex() throws IOException {
    writer.optimize();
    writer.close();
  }
}

No comments:

Post a Comment