First, I used RAMDirectory for the in-memory flavor: if you think the index can fit in memory, the results will be really snappy. If you have massive content to index, storing it on the file system is the next step; if you run your container on Linux you can then use NIOFSDirectory so that file access goes through NIO (not on MS Windows).
Second, it is recommended to always use a single instance of IndexWriter and Searcher over the index, especially if you are updating documents/fields on the fly; remember to commit and optimize accordingly so that the changes become visible.
Third, it is important to mention that QueryParser is not thread-safe.
Fourth, try to keep a single instance of Document and Field and use field.setValue(), so that when indexing massive content you avoid GC churn from redundant instances.
And last but not least, StandardAnalyzer offers basic token digestion, but you can implement your own analyzer as needed to customize the behavior.
Alright then, below are the impl details:
package com.martin.server;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
/**
 * Small in-memory search service built on a Lucene {@link RAMDirectory}.
 *
 * <p>Intended life cycle: {@link #start()}, any number of
 * {@link #addIndexableDocument(String, String, String)} calls, one {@link #buildIndex()},
 * then {@link #query(String, int)} as often as needed, and finally {@link #stop()}.
 *
 * <p>A single {@link IndexableDocument} instance is reused for every added document and no
 * synchronization is performed in this class, so callers that add documents from several
 * threads must serialize those calls themselves.
 */
public class SearchService {

    private static final org.slf4j.Logger log = org.slf4j.LoggerFactory.getLogger(SearchService.class);

    /** Name of the analyzed (tokenized) content field that queries run against. */
    private static final String TOKENIZED = "tok";

    private RAMDirectory index = null;
    private IndexWriter writer = null;
    private Searcher searcher = null;
    private Analyzer analyzer = null;
    /** Reused holder for the document currently being indexed; created lazily. */
    private IndexableDocument doc = null;

    public SearchService() { }

    /**
     * Creates the in-memory directory, the analyzer and the index writer.
     * Failures are logged and not propagated.
     */
    @javax.annotation.PostConstruct
    public void start() {
        try {
            index = new RAMDirectory();
            // Directory index = NIOFSDirectory.getDirectory("/fs/.."); // NIO on linux :)
            analyzer = new StandardAnalyzer(Version.LUCENE_30);
            writer = new IndexWriter(index, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); // 10k terms max
            // Tuning knobs, adjust as needed:
            //writer.setRAMBufferSizeMB(48); //IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB 16megs find your sweet-spot
            //writer.setMaxFieldLength(40000);
            //writer.setMergeFactor(300);
            //writer.setMaxBufferedDocs(150); // in function of setRAMBufferSizeMB
            //writer.setMaxMergeDocs(x) // if reached 2GB limit at 8M set it to 7M
            log.info("--==[ STARTED ]==--");
        } catch (final Exception e) {
            log.error("start()", e);
        }
    }

    /**
     * Closes the analyzer, searcher, writer and directory.
     *
     * <p>Each resource is closed independently and only if it was created, so a resource that
     * was never opened (for example the searcher when {@link #buildIndex()} was not called) or
     * a failing {@code close()} does not prevent the remaining resources from being closed.
     * Failures are logged and not propagated.
     */
    @javax.annotation.PreDestroy
    public void stop() {
        boolean clean = true;
        if (analyzer != null) {
            try {
                analyzer.close();
            } catch (final Exception e) {
                clean = false;
                log.error("stop()", e);
            }
        }
        if (searcher != null) {
            try {
                searcher.close();
            } catch (final Exception e) {
                clean = false;
                log.error("stop()", e);
            }
        }
        if (writer != null) {
            try {
                writer.close();
            } catch (final Exception e) {
                clean = false;
                log.error("stop()", e);
            }
        }
        if (index != null) {
            try {
                index.close();
            } catch (final Exception e) {
                clean = false;
                log.error("stop()", e);
            }
        }
        if (clean) {
            log.info("--==[ STOPPED ]==--");
        }
    }

    /**
     * Commits and optimizes the pending documents, closes the writer and opens the searcher
     * over the resulting index.
     *
     * <p>The writer is closed here, so this service does not expect further additions once the
     * index has been built. Failures are logged and not propagated.
     */
    public void buildIndex() {
        try {
            writer.commit();
            writer.optimize(true); // block till done
            log.info("Search.index.mem.size {} bytes", index.sizeInBytes());
            log.info("Search.writer.size.buffer {} megs", writer.getRAMBufferSizeMB());
            writer.close();
            searcher = new IndexSearcher(index);
        } catch (final Exception e) {
            log.error("buildIndex()", e);
        }
    }

    /**
     * Adds one document to the index.
     *
     * @param id      value stored in the {@link IndexableDocument#ID} field
     * @param name    value stored in the {@link IndexableDocument#NM} field
     * @param content text that is analyzed into the searchable field
     */
    public void addIndexableDocument( final String id , final String name , final String content ) {
        try {
            if (doc == null) {
                doc = new IndexableDocument();
            }
            // Reuse the same Document/Field instances and only swap their values.
            doc.setFields(id, name, content);
            writer.addDocument(doc.getDocument());
            log.debug("search.index.added {}", doc.toString());
        } catch (final Exception e) {
            log.error("addDoc()", e);
        }
    }

    /**
     * Runs a prefix query for {@code text} against the tokenized field.
     *
     * <p>The text is URL-decoded (UTF-8), escaped for the query parser and suffixed with
     * {@code *}.
     *
     * @param text URL-encoded search term; may be {@code null}
     * @param max  maximum number of hits to return
     * @return the matching documents; an empty list (never {@code null}) when {@code text} is
     *         {@code null} or blank, or when parsing or searching fails (the failure is logged)
     */
    public java.util.List<Document> query( final String text , final int max ) {
        final java.util.List<Document> none = new java.util.ArrayList<Document>();
        if (text == null) {
            return none;
        }
        try {
            final String txt = QueryParser.escape(java.net.URLDecoder.decode(text, "UTF-8")); // sanitize accordingly
            if (txt.trim().length() == 0) {
                // An empty term would leave only the trailing '*', which is not a usable prefix query.
                return none;
            }
            final String qtx = txt + "*"; // (txt + "* OR " + txt + "~"); // etc, craft your query here
            // QueryParser is not thread-safe, hence a fresh instance per call.
            final QueryParser parser = new QueryParser(Version.LUCENE_30, TOKENIZED, analyzer);
            // MultiFieldQueryParser(Version.LUCENE_30, String[] fields, analyzer);
            return doSearch(parser.parse(qtx), max);
        } catch (final Exception e) {
            log.error("search()", e);
            return none;
        }
    }

    /**
     * Executes an already-built query and loads the stored fields of every hit.
     *
     * @param query the query to run
     * @param max   maximum number of hits to return
     * @return the documents of the top hits, in the order returned by the searcher; possibly empty
     * @throws IllegalStateException if no searcher is available because {@link #buildIndex()}
     *                               has not completed successfully
     * @throws Exception             if the underlying search or document load fails
     */
    public java.util.List<Document> doSearch( final Query query , final int max ) throws Exception {
        if (searcher == null) {
            throw new IllegalStateException("index has not been built; call buildIndex() first");
        }
        log.debug("search.query {}", query.toString());
        final TopDocs hits = searcher.search(query, max);
        log.info("tothits {} , hits.scoreDocs.length {}", hits.totalHits, hits.scoreDocs.length);
        final java.util.List<Document> results = new java.util.ArrayList<Document>(hits.scoreDocs.length);
        for (int i = 0; i < hits.scoreDocs.length; i++) {
            // log.debug("explain {} ", searcher.explain(query, hits.scoreDocs[i].doc));
            results.add(searcher.doc(hits.scoreDocs[i].doc));
        }
        return results;
    }

    /**
     * Reusable wrapper around one Lucene {@link Document} with three fields: a stored id, a
     * stored name and a reader-backed content field. Customize as needed.
     */
    public static class IndexableDocument {

        /** Field name of the stored, non-analyzed id. */
        public static final String ID = "i";
        /** Field name of the stored, non-analyzed name. */
        public static final String NM = "n";

        private final Document doc;
        private final Field id;
        private final Field name;
        private final Field tok;

        /** Builds the document with empty placeholder values for all three fields. */
        public IndexableDocument() {
            doc = new Document();
            id = new Field(ID, "", Field.Store.YES, Field.Index.NOT_ANALYZED);
            name = new Field(NM, "", Field.Store.YES, Field.Index.NOT_ANALYZED);
            tok = new Field(TOKENIZED, new java.io.StringReader("")); // not STORED + ANALYZED
            doc.add(id);
            doc.add(name);
            doc.add(tok);
        }

        /** @return the underlying Lucene document (the same instance on every call) */
        public Document getDocument() {
            return doc;
        }

        /**
         * Replaces the values of the three fields in place.
         *
         * @param i      new id value
         * @param n      new name value
         * @param tokens new content to be analyzed
         */
        public void setFields( final String i , final String n , final String tokens ) {
            id.setValue(i);
            name.setValue(n);
            tok.setValue(new java.io.StringReader(tokens));
        }

        @Override
        public String toString() {
            return doc.toString();
        }
    }
}
And below is a quick test case just to depict usage:
/**
 * Usage example: indexes 4000 small documents, builds the index, runs the prefix query
 * "short" and logs the id and name of each hit.
 *
 * Unexpected runtime failures are left to propagate so the test actually fails, and the
 * service is always stopped.
 */
@Test
public void searchQuery() {
    final SearchService search = new SearchService();
    search.start();
    try {
        for (int i = 0; i < 1000; i++) {
            // Four documents per iteration: derive the ids from i * 4 so every document
            // gets a distinct id (i, i+1, i+2, i+3 would collide with the next iterations).
            final int base = i * 4;
            search.addIndexableDocument(String.valueOf(base), "Denim" + String.valueOf(i), "denim jeans shorts pant");
            search.addIndexableDocument(String.valueOf(base + 1), "Jackets" + String.valueOf(i), "jackets collar long sleeve shorts sleeve ");
            search.addIndexableDocument(String.valueOf(base + 2), "Underwear" + String.valueOf(i), "underwear bra panties shorts");
            search.addIndexableDocument(String.valueOf(base + 3), "Sale" + String.valueOf(i), "sale shoes denim jewlery accessories jeans pants shorts jackets watches t-shirts");
        }
        search.buildIndex();
        final List<Document> results = search.query("short", 30);
        if (results != null) {
            for (final Document doc : results) {
                log.info("[ {} ] = {} ",
                        doc.get(SearchService.IndexableDocument.ID),
                        doc.get(SearchService.IndexableDocument.NM));
            }
        }
    } finally {
        search.stop();
    }
}
The id fields you choose to store in the index should be as direct and as minimal as possible, so that you can re-hydrate search results from a cache or the DB; see IndexableDocument and adjust it according to your needs. Wrap that into a RESTful API returning, say, JSON or XML, and voilà, there you have it. Or simply go with the Solr wrapper if you need more features, such as an admin web interface or distributed index management, which could be challenging if you think of rolling your own, etc. In a production environment you would have to account for many other important things, yet here is an 'in-a-nutshell' reference implementation. Till the next experiment! Cheers,
Martin

0 comments:
Post a Comment