Following are the steps to create indexing using Apache Lucene Search API
- Create IndexWriter
- For each input
- Create a Document
- Add Fields to the Document
- Add the Document to the IndexWriter
- Close the IndexWriter
- Optimize (optional)
Sample Example:
package com.techyhouse.lucene;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import java.io.*;
import java.util.ArrayList;
public class LuceneTextFileIndexer {
private IndexWriter writer;
private ArrayList<File> queue = new ArrayList<File>();
public static void main(String[] args) throws IOException {
System.out.println("Enter the path where the index will be created: ");
BufferedReader br = new BufferedReader(
new InputStreamReader(System.in));
String s = br.readLine();
TextFileIndexer indexer = null;
try {
indexer = new TextFileIndexer(s);
} catch (Exception ex) {
System.out.println("Cannot create index..." + ex.getMessage());
System.exit(-1);
}
while (!s.equalsIgnoreCase("q")) {
try {
System.out.println("Enter the file or folder name to add into the index (q=quit):");
System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
s = br.readLine();
if (s.equalsIgnoreCase("q")) {
break;
}
//try to add file into the index
indexer.indexFileOrDirectory(s);
} catch (Exception e) {
System.out.println("Error indexing " + s + " : " + e.getMessage());
}
}
//===================================================
//after adding, we always have to call the
//closeIndex, otherwise the index is not created
//===================================================
indexer.closeIndex();
}
/**
* Constructor
* @param indexDir the name of the folder in which the index should be created
* @throws java.io.IOException
*/
TextFileIndexer(String indexDir) throws IOException {
// the boolean true parameter means to create a new index everytime,
// potentially overwriting any existing files there.
FSDirectory dir = FSDirectory.open(new File(indexDir));
writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_CURRENT),
true, IndexWriter.MaxFieldLength.LIMITED);
}
/**
* Indexes a file or directory
* @param fileName the name of a text file or a folder we wish to add to the index
* @throws java.io.IOException
*/
public void indexFileOrDirectory(String fileName) throws IOException {
//===================================================
//gets the list of files in a folder (if user has submitted
//the name of a folder) or gets a single file name (is user
//has submitted only the file name)
//===================================================
listFiles(new File(fileName));
int originalNumDocs = writer.numDocs();
for (File f : queue) {
FileReader fr = null;
try {
Document doc = new Document();
//===================================================
// add contents of file
//===================================================
fr = new FileReader(f);
doc.add(new Field("contents", fr));
//===================================================
//adding second field which contains the path of the file
//===================================================
doc.add(new Field("path", fileName,
Field.Store.YES,
Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
System.out.println("Added: " + f);
} catch (Exception e) {
System.out.println("Could not add: " + f);
} finally {
fr.close();
}
}
int newNumDocs = writer.numDocs();
System.out.println("");
System.out.println("************************");
System.out.println((newNumDocs - originalNumDocs) + " documents added.");
System.out.println("************************");
queue.clear();
}
private void listFiles(File file) {
if (!file.exists()) {
System.out.println(file + " does not exist.");
}
if (file.isDirectory()) {
for (File f : file.listFiles()) {
listFiles(f);
}
} else {
String filename = file.getName().toLowerCase();
//===================================================
// Only index text files
//===================================================
if (filename.endsWith(".htm") || filename.endsWith(".html") ||
filename.endsWith(".xml") || filename.endsWith(".txt")) {
queue.add(file);
} else {
System.out.println("Skipped " + filename);
}
}
}
/**
* Close the index.
* @throws java.io.IOException
*/
public void closeIndex() throws IOException {
writer.optimize();
writer.close();
}
}
package com.techyhouse.lucene;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import java.io.*;
import java.util.ArrayList;
public class LuceneTextFileIndexer {
private IndexWriter writer;
private ArrayList<File> queue = new ArrayList<File>();
public static void main(String[] args) throws IOException {
System.out.println("Enter the path where the index will be created: ");
BufferedReader br = new BufferedReader(
new InputStreamReader(System.in));
String s = br.readLine();
TextFileIndexer indexer = null;
try {
indexer = new TextFileIndexer(s);
} catch (Exception ex) {
System.out.println("Cannot create index..." + ex.getMessage());
System.exit(-1);
}
while (!s.equalsIgnoreCase("q")) {
try {
System.out.println("Enter the file or folder name to add into the index (q=quit):");
System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
s = br.readLine();
if (s.equalsIgnoreCase("q")) {
break;
}
//try to add file into the index
indexer.indexFileOrDirectory(s);
} catch (Exception e) {
System.out.println("Error indexing " + s + " : " + e.getMessage());
}
}
//===================================================
//after adding, we always have to call the
//closeIndex, otherwise the index is not created
//===================================================
indexer.closeIndex();
}
/**
* Constructor
* @param indexDir the name of the folder in which the index should be created
* @throws java.io.IOException
*/
TextFileIndexer(String indexDir) throws IOException {
// the boolean true parameter means to create a new index everytime,
// potentially overwriting any existing files there.
FSDirectory dir = FSDirectory.open(new File(indexDir));
writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_CURRENT),
true, IndexWriter.MaxFieldLength.LIMITED);
}
/**
* Indexes a file or directory
* @param fileName the name of a text file or a folder we wish to add to the index
* @throws java.io.IOException
*/
public void indexFileOrDirectory(String fileName) throws IOException {
//===================================================
//gets the list of files in a folder (if user has submitted
//the name of a folder) or gets a single file name (is user
//has submitted only the file name)
//===================================================
listFiles(new File(fileName));
int originalNumDocs = writer.numDocs();
for (File f : queue) {
FileReader fr = null;
try {
Document doc = new Document();
//===================================================
// add contents of file
//===================================================
fr = new FileReader(f);
doc.add(new Field("contents", fr));
//===================================================
//adding second field which contains the path of the file
//===================================================
doc.add(new Field("path", fileName,
Field.Store.YES,
Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
System.out.println("Added: " + f);
} catch (Exception e) {
System.out.println("Could not add: " + f);
} finally {
fr.close();
}
}
int newNumDocs = writer.numDocs();
System.out.println("");
System.out.println("************************");
System.out.println((newNumDocs - originalNumDocs) + " documents added.");
System.out.println("************************");
queue.clear();
}
private void listFiles(File file) {
if (!file.exists()) {
System.out.println(file + " does not exist.");
}
if (file.isDirectory()) {
for (File f : file.listFiles()) {
listFiles(f);
}
} else {
String filename = file.getName().toLowerCase();
//===================================================
// Only index text files
//===================================================
if (filename.endsWith(".htm") || filename.endsWith(".html") ||
filename.endsWith(".xml") || filename.endsWith(".txt")) {
queue.add(file);
} else {
System.out.println("Skipped " + filename);
}
}
}
/**
* Close the index.
* @throws java.io.IOException
*/
public void closeIndex() throws IOException {
writer.optimize();
writer.close();
}
}
No comments:
Post a Comment