亚洲免费在线-亚洲免费在线播放-亚洲免费在线观看-亚洲免费在线观看视频-亚洲免费在线看-亚洲免费在线视频

lucene-索引的優(yōu)化和索引過(guò)程查看

系統(tǒng) 1572 0

lucene-索引的優(yōu)化和索引過(guò)程查看

lucene-索引的優(yōu)化和索引過(guò)程查看

代碼:(索引建立)

package bindex;

import java.io.IOException;
import java.io.PrintStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.RegexFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class perfieldindextest {

/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
String indexpath="./indexes";

IndexWriter writer;
PerFieldAnalyzerWrapper wr;
Document doc;
try {
RAMDirectory rd=new RAMDirectory();
writer=new IndexWriter(rd,new StandardAnalyzer());
//限制第個(gè)field最大詞條數(shù)為100
writer.setMaxFieldLength(100);
writer.setInfoStream(System.out);//查看索引過(guò)程
wr=new PerFieldAnalyzerWrapper(new StandardAnalyzer());
wr.addAnalyzer("title",new MMAnalyzer());
wr.addAnalyzer("content", new MMAnalyzer());
wr.addAnalyzer("author", new MMAnalyzer());
wr.addAnalyzer("time", new StandardAnalyzer());
//提取騰迅國(guó)內(nèi)新聞鏈接
LinkBean lb=new LinkBean();
List baseurls=new ArrayList();
baseurls.add(" http://news.qq.com/china_index.shtml ");
baseurls.add(" http://news.qq.com/world_index.shtml ");
baseurls.add(" http://news.qq.com/society_index.shtml ");
for (int j=0;j<baseurls.size();j++){
lb.setURL((String)baseurls.get(j));
URL[] urls=lb.getLinks();
for (int i=0;i<urls.length;i++){
doc=new Document();
String title="";
String content="";
String time="";
String author="";
System.out.println("正在提取"+(String)baseurls.get(j)+"第"+i+"個(gè)鏈接("+(int)(100*(i+1)/urls.length)+"%)["+urls[i].toString()+"].....");
if (!(urls[i].toString().startsWith(" http://news.qq.com/a/ "))){
System.out.println("非新聞鏈接,忽略......");continue;
}
System.out.println("新聞鏈接,正在處理");
Parser parser=new Parser(urls[i].toString());
parser.setEncoding("GBK");
String url=urls[i].toString();
NodeFilter filter_title=new TagNameFilter("title");
NodeList nodelist=parser.parse(filter_title);
Node node_title=nodelist.elementAt(0);
title=node_title.toPlainTextString();
System.out.println("標(biāo)題:"+title);
parser.reset();
NodeFilter filter_auth=new OrFilter(new HasAttributeFilter("class","auth"),new HasAttributeFilter("class","where"));
nodelist=parser.parse(filter_auth);
Node node_auth=nodelist.elementAt(0);
if (node_auth != null) author=node_auth.toPlainTextString();
else author="騰訊網(wǎng)";
node_auth=nodelist.elementAt(1);
if (node_auth != null) author+=node_auth.toPlainTextString();
System.out.println("作者:"+author);
parser.reset();
NodeFilter filter_time=new OrFilter(new HasAttributeFilter("class","info"),new RegexFilter("[0-9]{4}年[0-9]{1,2}月[0-9]{1,2}日[' ']*[0-9]{1,2}:[0-9]{1,2}"));
nodelist=parser.parse(filter_time);
Node node_time=nodelist.elementAt(0);
if (node_time!=null) {
if (node_time.getChildren()!=null) node_time=node_time.getFirstChild();
time=node_time.toPlainTextString().replaceAll("[ |\t|\n|\f|\r\u3000]","").substring(0,16);
}

System.out.println("時(shí)間:"+time);
parser.reset();
NodeFilter filter_content=new OrFilter(new OrFilter(new HasAttributeFilter("style","TEXT-INDENT: 2em"),new HasAttributeFilter("id","Cnt-Main-Article-QQ")),new HasAttributeFilter("id","ArticleCnt"));
nodelist=parser.parse(filter_content);
Node node_content=nodelist.elementAt(0);
if (node_content!=null){
content=node_content.toPlainTextString().replaceAll("(#.*)|([a-z].*;)|}","").replaceAll(" |\t|\r|\n|\u3000","");
}
System.out.println("內(nèi)容:"+content);
System.out.println("正在索引.....");
Field field=new Field("title",title,Field.Store.YES,Field.Index.TOKENIZED);
doc.add(field);
field=new Field("content",content,Field.Store.YES,Field.Index.TOKENIZED);
doc.add(field);
field=new Field("author",author,Field.Store.YES,Field.Index.UN_TOKENIZED);
doc.add(field);
field=new Field("time",time,Field.Store.YES,Field.Index.NO);
doc.add(field);
field=new Field("url",url,Field.Store.YES,Field.Index.NO);
doc.add(field);
writer.addDocument(doc,new MMAnalyzer());
System.out.println("<"+title+"索引成功>");
}
}
writer.close();
wr.close();
FSDirectory fd=FSDirectory.getDirectory(indexpath);
IndexWriter wi=new IndexWriter(fd,new MMAnalyzer());
PrintStream ps=new PrintStream("log.txt");
wi.setInfoStream(ps);//將索引過(guò)程輸出到文件
wi.addIndexes(new Directory[]{rd});
//內(nèi)存中文檔最大值是40
wi.setMaxMergeDocs(80);
//內(nèi)存中存儲(chǔ)40個(gè)文檔時(shí)寫(xiě)成磁盤(pán)一個(gè)塊
wi.setMergeFactor(80);
System.out.println("<索引優(yōu)化中>");
//索引本身的優(yōu)化,多個(gè)索引文件合并成單個(gè)文件
wi.optimize();
wi.close();
System.out.println("<索引建立完畢>");
} catch (ParserException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (CorruptIndexException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

}

servlet:

package bservlet;

import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;

import java.io.*;

import jeasy.analysis.MMAnalyzer;


public class SluceneSearcher extends HttpServlet {
private String indexpath="D:/workspace/testsearch2/indexes";
public void doPost(HttpServletRequest request,HttpServletResponse response){
StringBuffer sb=new StringBuffer("");
try {
request.setCharacterEncoding("GBK");
String phrase=request.getParameter("phrase");
Analyzer analyzer=new MMAnalyzer();
IndexSearcher searcher;
searcher = new IndexSearcher(indexpath);
QueryParser parser=new QueryParser("content",analyzer);
Query q= parser.parse(phrase);
Hits hs=searcher.search(q);
int num=hs.length();
sb.append("<h1>您搜索到的記錄數(shù):"+num+"</h1>");
for (int i=0;i<num;i++){
Document doc=hs.doc(i);
if (doc==null){
continue;
}
Field field_title=doc.getField("title");
String title="<br><a href="+doc.getField("url").stringValue()+" target='_blank'>"+field_title.stringValue()+"</a><br>";
Field field_author=doc.getField("author");
String author="<br>author:<br>"+field_author.stringValue();
Field field_time=doc.getField("time");
String time="<br>time:<br>"+field_time.stringValue();
sb.append(title);
sb.append(author);
sb.append(time);
}
searcher.close();
} catch (CorruptIndexException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
PrintWriter out;
try {
response.setContentType("text/html;charset=GBK");
out = response.getWriter();
out.print(sb.toString());
out.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}

}
public void doGet(HttpServletRequest request,HttpServletResponse response){
doPost(request,response);
}

}

lucene-索引的優(yōu)化和索引過(guò)程查看


更多文章、技術(shù)交流、商務(wù)合作、聯(lián)系博主

微信掃碼或搜索:z360901061

微信掃一掃加我為好友

QQ號(hào)聯(lián)系: 360901061

您的支持是博主寫(xiě)作最大的動(dòng)力,如果您喜歡我的文章,感覺(jué)我的文章對(duì)您有幫助,請(qǐng)用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點(diǎn)擊下面給點(diǎn)支持吧,站長(zhǎng)非常感激您!手機(jī)微信長(zhǎng)按不能支付解決辦法:請(qǐng)將微信支付二維碼保存到相冊(cè),切換到微信,然后點(diǎn)擊微信右上角掃一掃功能,選擇支付二維碼完成支付。

【本文對(duì)您有幫助就好】

您的支持是博主寫(xiě)作最大的動(dòng)力,如果您喜歡我的文章,感覺(jué)我的文章對(duì)您有幫助,請(qǐng)用微信掃描上面二維碼支持博主2元、5元、10元、自定義金額等您想捐的金額吧,站長(zhǎng)會(huì)非常 感謝您的哦!!!

發(fā)表我的評(píng)論
最新評(píng)論 總共0條評(píng)論
主站蜘蛛池模板: 99热这里只有精品1 99热这里只有精品18 | 快射视频网 | 亚洲午夜日韩高清一区 | 成人国产精品一级毛片视频 | 久久青草免费免费91线频观看 | 成人a毛片免费全部播放 | 天天操天天操天天干 | 麻豆一区二区三区四区 | 天天操天天爱天天干 | 高清一级毛片 | 亚洲成人高清在线观看 | 天天射天天干天天操 | 四虎影视在线麻豆国产 | 赛车总动员2在线观看 | 福利姬视频在线观看 | 永久免费的啪啪免费的网址 | 久久精品久久精品国产大片 | 国产亚洲精品视频中文字幕 | 欧美区视频 | 国产农村一一级特黄毛片 | 97精品在线播放 | 女人18毛片一级毛片在线 | 久久精品国产精品亚洲婷婷 | 中文字幕视频在线播放 | 天天曰曰| 高h女 | 在线看的成人性视频 | 日韩成人综合网 | 日日日操 | 四虎 在线播放 | 国产专区日韩精品欧美色 | 在线播放成人毛片免费视 | 日本免费爱爱视频 | 国产精品伊人 | 天天草天天 | 日韩中文字幕在线有码视频网 | 蜜桃日本一道无卡不码高清 | 亚洲精品欧洲久久婷婷99 | 国产美女流白浆的免费视 | 日本成人中文字幕 | 99香蕉国产|