lucene自定义同义词实现


lucene同义词搜索原理其实是根据 PositionIncrementAttribute 和 CharTermAttribute 的词元记录信息来实现的。当前使用的 lucene 版本为 4.8.0。首先要实现同义词过滤器:
packagelucene_index; importjava.io.IOException; importjava.util.Map; importjava.util.Stack; importorg.apache.lucene.analysis.TokenFilter; importorg.apache.lucene.analysis.TokenStream; importorg.apache.lucene.analysis.tokenattributes.CharTermAttribute; importorg.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; importorg.apache.lucene.util.AttributeSource; publicclassMySameFiterextendsTokenFilter{ Stack<String>stack=null; privateCharTermAttributecta=null;//词元信息 privatePositionIncrementAttributeposition=null;//词元位置信息 privateAttributeSource.Statecurrent;//记录当前的词元位置状态 privateMap<String,String[]>map;//同义词表 protectedMySameFiter(TokenStreaminput,Map<String,String[]>map){ super(input); stack=newStack<>(); cta=input.addAttribute(CharTermAttribute.class); position=input.addAttribute(PositionIncrementAttribute.class); this.map=map; } @Override publicbooleanincrementToken()throwsIOException{ //同义词操作 while(stack.size()>0){ Stringword=stack.pop(); restoreState(current); cta.setEmpty(); cta.append(word); position.setPositionIncrement(0); returntrue; } //判断是否有下一个分词 if(!input.incrementToken()){ returnfalse; } //获取当前的状态 if(getSameWrds(cta.toString())){ current=captureState(); } returntrue; } privatebooleangetSameWrds(Stringwords){ String[]arr=map.get(words); if(arr!=null){ for(Stringword:arr){ stack.push(word); } returntrue; } returnfalse; } } 自定义分词器
package lucene_index;

import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * Analyzer that tokenizes text with the IK tokenizer and expands synonyms
 * through {@code MySameFiter}.
 */
public class StopWrodsAnalyse extends Analyzer {

    /** Synonym table: original term -&gt; synonyms. */
    private final Map<String, String[]> map;

    /**
     * @param map synonym table used by the synonym filter
     */
    public StopWrodsAnalyse(Map<String, String[]> map) {
        this.map = map;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // IK tokenizer; "false" selects its non-smart (fine-grained) mode.
        Tokenizer source = new IKTokenizer(reader, false);
        // Wrap with the custom synonym filter backed by the table.
        TokenStream stream = new MySameFiter(source, map);
        return new TokenStreamComponents(source, stream);
    }
}
package lucene_index;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import javax.print.Doc;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Demo driver: loads a GBK-encoded synonym CSV, indexes each head word with
 * the synonym-expanding analyzer, then runs a term query against the index.
 */
public class MainTest {

    public static void main(String[] args) throws IOException, ParseException {
        // Load the synonym table. Each CSV line is "word,synonym1,synonym2,...";
        // the leading "-," marker (if present) is stripped first.
        Map<String, String[]> map = new HashMap<String, String[]>();
        LineIterator it = FileUtils.lineIterator(
                new File("E://searchwork_custom//data_index//ConfigFile//ExpansionWord.csv"), "gbk");
        try {
            while (it.hasNext()) {
                String line = it.nextLine();
                String[] wordArr = line.replace("-,", "").trim().split("\\,");
                // First occurrence wins; duplicate head words are skipped.
                if (!map.containsKey(wordArr[0])) {
                    map.put(wordArr[0], wordArr);
                }
            }
        } finally {
            // The original code leaked the iterator (and its file handle).
            LineIterator.closeQuietly(it);
        }

        Analyzer analyzer = new StopWrodsAnalyse(map);
        Directory directory = FSDirectory.open(new File("E:\\luceneindex"));
        IndexWriter writer = new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_48, analyzer));
        try {
            // One document per synonym head word, analyzed into the "name" field.
            Collection<Document> docs = new ArrayList<Document>();
            for (Map.Entry<String, String[]> entry : map.entrySet()) {
                Document doc = new Document();
                doc.add(new Field("name", entry.getKey(), Store.YES, Index.ANALYZED));
                docs.add(doc);
            }
            writer.addDocuments(docs);
            writer.commit();
        } finally {
            writer.close(); // always release the index lock
        }

        // Re-use the already-open Directory instead of opening the same path a
        // second time as the original did.
        IndexReader reader = IndexReader.open(directory);
        try {
            search(new IndexSearcher(reader));
        } finally {
            // The original never closed the reader or the directory.
            reader.close();
            directory.close();
        }
    }

    /**
     * Runs a term query for "中国建设银行" on the "name" field and prints the
     * stored field of each hit.
     *
     * <p>NOTE(review): TermQuery bypasses the analyzer, so this only matches
     * if the indexed token stream contains the whole phrase as a single term
     * (e.g. via the synonym expansion) — confirm against the synonym table.
     *
     * @param searcher open searcher over the demo index
     * @throws IOException on index access failure
     */
    public static void search(IndexSearcher searcher) throws IOException {
        Query q = new TermQuery(new Term("name", "中国建设银行"));
        System.out.println(q);
        TopDocs top = searcher.search(q, 10);
        for (ScoreDoc sd : top.scoreDocs) {
            Document d = searcher.doc(sd.doc);
            System.out.println(d.get("name"));
        }
    }
}
3. 测试:当搜索“建行”、“建设银行”或“中国建设银行”中任意一个词时,由于同义词在同一位置展开,均能命中对应的文档。
优质内容筛选与推荐>>
1、U3D一些使用
2、centos挂载磁盘及扩展根目录
3、easyui datagrid添加移除editor的扩展方法
4、python使用requests库爬取网页的小实例:爬取京东网页
5、java设计模式——访问者模式


长按二维码向我转账

受苹果公司新规定影响,微信 iOS 版的赞赏功能被关闭,可通过二维码转账支持公众号。

    阅读
    好看
    已推荐到看一看
    你的朋友可以在“发现”-“看一看”看到你认为好看的文章。
    已取消,“好看”想法已同步删除
    已推荐到看一看 和朋友分享想法
    最多200字,当前共 发送

    已发送

    朋友将在看一看看到

    确定
    分享你的想法...
    取消

    分享想法到看一看

    确定
    最多200字,当前共

    发送中

    网络异常,请稍后重试

    微信扫一扫
    关注该公众号





    联系我们

    欢迎来到TinyMind。

    关于TinyMind的内容或商务合作、网站建议,举报不良信息等均可联系我们。

    TinyMind客服邮箱:support@tinymind.net.cn