本文旨在做一个lucene的搜索模板,提供类似于优酷的视频搜索服务。
效果:
基于lucene3.6.1
由于数量少就没有分页,实际上是可以分页的,例子是20个结果一页。
有些重复用相同原理的搜索选项就没弄上去了。
中文分词用的是IKAnalyzer2012.tag的作用在于增加匹配的权重,例子的tag是sfa.
既然是模板,当然主要是用了java设计模式里面的模板方法了。
建索引的模板:
package index;

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.util.List;
import java.util.Map;

import javax.sql.DataSource;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.MapListHandler;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.mchange.v2.c3p0.ComboPooledDataSource;

/**
 * Template-method base class for building a Lucene index from a SQL query.
 *
 * <p>{@link #generate_index(String, String)} fixes the overall algorithm
 * (run the query, open the index, write, close); subclasses only decide how
 * each result row becomes a document by implementing
 * {@link #add_to_writer(List, IndexWriter)}.
 */
public abstract class Abstract_Index_builder implements Index_Builder {
    private Analyzer analyzer = new IKAnalyzer();

    /**
     * Looks up the bean named {@code dataSource} from
     * {@code applicationContext.xml} on the classpath.
     *
     * <p>A new Spring context is created on every call.
     *
     * @return the c3p0 pooled data source declared in the Spring configuration
     */
    public DataSource get_data_source() {
        ApplicationContext ctx = new ClassPathXmlApplicationContext(
                new String[] { "applicationContext.xml" }); // Spring configuration
        ComboPooledDataSource pool = (ComboPooledDataSource) ctx
                .getBean("dataSource"); // the c3p0 connection pool bean
        return (DataSource) pool;
    }

    /**
     * Runs {@code sql} and returns every row as a column-name to value map.
     *
     * <p>The runner is built on the data source, so Commons DbUtils borrows
     * the connection and returns it itself. The previous version passed
     * {@code ds.getConnection()} and never closed it, leaking one pooled
     * connection per call.
     *
     * @param ds  data source to query
     * @param sql SELECT statement to execute
     * @return one map per result row
     * @throws SQLException if the query fails
     */
    public List<Map<String, Object>> do_query(DataSource ds, String sql)
            throws SQLException {
        QueryRunner runner = new QueryRunner(ds);
        List<Map<String, Object>> results = (List<Map<String, Object>>) runner
                .query(sql, new MapListHandler());
        return results;
    }

    /**
     * Converts the queried rows into documents and adds them to the writer.
     *
     * @param data   rows returned by {@link #do_query(DataSource, String)}
     * @param writer open index writer; the caller closes it
     */
    public abstract void add_to_writer(List<Map<String, Object>> data,
            IndexWriter writer);

    /**
     * Rebuilds the index at {@code path} from the rows selected by {@code sql}.
     *
     * <p>The index is opened in {@code CREATE} mode, i.e. existing content is
     * replaced.
     *
     * @param path directory of the index on disk
     * @param sql  query that selects the rows to index
     * @throws IOException    if the index cannot be opened or written
     * @throws SQLException   if the query fails
     * @throws ParseException declared for interface compatibility
     */
    public void generate_index(String path, String sql) throws IOException,
            SQLException, ParseException {
        // Query first: a database failure must not leave a half-open writer
        // (and its write lock) sitting on top of the existing index.
        List<Map<String, Object>> data = do_query(get_data_source(), sql);

        IndexWriterConfig conf = new IndexWriterConfig(
                org.apache.lucene.util.Version.LUCENE_36, analyzer);
        conf.setOpenMode(OpenMode.CREATE);
        FSDirectory directory = FSDirectory.open(new File(path));
        try {
            IndexWriter writer = new IndexWriter(directory, conf);
            boolean committed = false;
            try {
                add_to_writer(data, writer);
                writer.close(); // commits the new index
                committed = true;
            } finally {
                if (!committed) {
                    // Discard partial work and release the write lock.
                    writer.rollback();
                }
            }
        } finally {
            directory.close();
        }
    }
}
具体的实现子类:
package index;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Map;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;

/**
 * Concrete index builder that turns each video row into one Lucene document.
 */
public class Generate_Video_Index extends Abstract_Index_builder {

    /**
     * Adds one document per row to {@code writer}.
     *
     * <p>Rows whose {@code video_uploadtime} is missing or unparseable are
     * skipped with a message on stderr; previously such a row caused a
     * {@link NullPointerException} that aborted the whole build. A failure to
     * add a single document is reported and the loop continues.
     *
     * @param data   rows containing the columns {@code video_name}, {@code tag},
     *               {@code video_uploadtime}, {@code duration}, {@code id},
     *               {@code video_type_id}, {@code chanel_id}, {@code video_id},
     *               {@code name} and {@code video_watchtimes}
     * @param writer open index writer
     */
    @Override
    public void add_to_writer(List<Map<String, Object>> data, IndexWriter writer) {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy.MM.dd");
        for (Map<String, Object> row : data) {
            Date uploaded = upload_date(row.get("video_uploadtime"), sdf);
            if (uploaded == null) {
                System.err.println("Skipping video " + row.get("video_id")
                        + ": unusable video_uploadtime "
                        + row.get("video_uploadtime"));
                continue;
            }

            Field video_name = new Field("video_name", text(row, "video_name"),
                    Store.YES, Index.ANALYZED);
            Field tag = new Field("tag", text(row, "tag"), Store.YES,
                    Index.ANALYZED);
            // upload time, stored as epoch milliseconds
            NumericField video_uploadtime = new NumericField(
                    "video_uploadtime", Store.YES, true)
                    .setLongValue(uploaded.getTime());
            // running time of the video
            NumericField duration = new NumericField("duration", Store.YES,
                    true).setIntValue(number(row, "duration"));
            // user id
            Field id = new Field("id", text(row, "id"), Store.YES,
                    Index.NOT_ANALYZED);
            // category
            NumericField video_type_id = new NumericField("video_type_id",
                    Store.YES, true).setIntValue(number(row, "video_type_id"));
            NumericField chanel_id = new NumericField("chanel_id", Store.YES,
                    true).setIntValue(number(row, "chanel_id"));
            NumericField video_id = new NumericField("video_id", Store.YES,
                    true).setIntValue(number(row, "video_id"));
            Field name = new Field("name", text(row, "name"), Store.YES,
                    Index.NOT_ANALYZED);
            // play count
            Field watch = new Field("watch", text(row, "video_watchtimes"),
                    Store.YES, Index.NOT_ANALYZED);

            Document doc = new Document();
            doc.add(video_id);
            doc.add(chanel_id);
            doc.add(video_name);
            doc.add(tag);
            doc.add(video_uploadtime);
            doc.add(id);
            doc.add(name);
            doc.add(watch);
            doc.add(duration);
            doc.add(video_type_id);
            try {
                writer.addDocument(doc);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Resolves the upload time of a row: a {@link Date} value is used as is,
     * anything else is parsed from its string form; returns {@code null} when
     * the value is absent or does not match the format.
     */
    private static Date upload_date(Object raw, SimpleDateFormat format) {
        if (raw instanceof Date) {
            return (Date) raw;
        }
        if (raw == null) {
            return null;
        }
        try {
            return format.parse(raw.toString());
        } catch (ParseException e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Returns the column as text, mapping SQL NULL to the empty string. */
    private static String text(Map<String, Object> row, String column) {
        Object value = row.get(column);
        return value == null ? "" : value.toString();
    }

    /**
     * Returns the column as an int. Accepts any {@link Number} because JDBC
     * drivers may hand back Long/BigDecimal for integer columns, which the
     * former {@code (int)} cast rejected with a ClassCastException. The column
     * must not be NULL.
     */
    private static int number(Map<String, Object> row, String column) {
        return ((Number) row.get(column)).intValue();
    }
}
调用建立索引模板:
实际上不用把需要展示的所有搜索结果项都写入lucene,只用把id之类的写入就可以了。lucene完成查询后取出id再从数据库查询一遍。例子只是为了简单方便才那样做的。
package index;

import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;

/**
 * Command-line entry point that rebuilds the video index.
 */
public class Build_Index {
    /** Directory that holds the video index. */
    private static final String VIDEO_INDEX_DIR = "E://lucene/video";

    /** Selects every video joined with its uploader and its top-level type. */
    private static final String VIDEO_SQL =
            "select user.name,vi.video_id,vi.video_name,vi.video_watchtimes,vi.video_uploadtime,vi.tag,vi.id,vi.duration,type.video_type_id,vi.chanel_id "
            + "from video_info vi inner join user as "
            + "user on vi.id=user.id inner join video_sub_type as sub_type on sub_type.video_sub_type_id=vi.video_sub_type_id inner "
            + "join video_type as type on type.video_type_id=sub_type.video_type_id";

    /**
     * Runs the video index builder against the configured directory and query.
     *
     * @throws SQLException             if the query fails
     * @throws IOException              if the index cannot be written
     * @throws java.text.ParseException propagated from the index builder
     */
    public static void build_index() throws SQLException, IOException,
            java.text.ParseException {
        Abstract_Index_builder builder = new Generate_Video_Index();
        builder.generate_index(VIDEO_INDEX_DIR, VIDEO_SQL);
    }

    /** Rebuilds the index; command-line arguments are ignored. */
    public static void main(String[] args) throws SQLException, IOException,
            ParseException {
        build_index();
    }
}
搜索模板:
package search;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;

import javax.servlet.http.HttpServletRequest;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.struts2.interceptor.ServletRequestAware;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.opensymphony.xwork2.ActionSupport;

/**
 * Template-method base class for Struts2 search actions.
 *
 * <p>{@link #do_search} opens the index, runs the query, slices the requested
 * page and sets up highlighting; subclasses turn the hits into view data in
 * {@link #get_search_result}.
 */
public abstract class Abstract_Search_builder extends ActionSupport implements
        Search_Builder, ServletRequestAware { // struts2
    protected Analyzer analyzer = new IKAnalyzer();
    public HttpServletRequest request;
    protected int count;

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    /**
     * Converts the hits of the current page into rows for the view.
     *
     * @param docs        hits of the requested page
     * @param highlighter highlighter configured for the executed query
     * @param search      searcher to load stored documents from; only valid
     *                    during this call
     * @param count       total number of hits for the query
     * @return one map per hit
     */
    public abstract List<Map<String, Object>> get_search_result(
            ScoreDoc[] docs, Highlighter highlighter, IndexSearcher search,
            int count);

    public void setServletRequest(HttpServletRequest request) {
        this.request = request;
    }

    /**
     * Executes {@code query1} against the index at {@code path} and returns
     * the rows produced by {@link #get_search_result}.
     *
     * <p>Searcher, reader and directory are always released. Previously only
     * the searcher was closed, and only on success; since
     * {@code IndexSearcher.close()} does not close a reader handed to its
     * constructor, every request leaked the reader's file handles.
     *
     * @param start  zero-based page number; ignored when {@code c} is null
     * @param offset page size; ignored when {@code c} is null
     * @param c      collector sized for the requested page, or {@code null}
     *               to return every hit (non-paged request)
     * @param query1 query to run
     * @param path   directory of the index on disk
     * @return rows for the requested page
     * @throws IOException if the index cannot be opened or read
     */
    public List<Map<String, Object>> do_search(int start, int offset,
            TopDocsCollector c, Query query1, String path) throws IOException {
        FSDirectory d = FSDirectory.open(new File(path));
        try {
            IndexReader r = IndexReader.open(d);
            try {
                IndexSearcher search = new IndexSearcher(r);
                try {
                    if (c == null) { // not a paged request: fetch everything
                        // A collector must hold at least one hit, so an
                        // empty index (maxDoc == 0) is rounded up to 1.
                        int everything = Math.max(1, search.maxDoc());
                        c = TopScoreDocCollector.create(everything, false);
                        offset = everything;
                        start = 0;
                    }
                    search.search(query1, c);
                    int count = c.getTotalHits();
                    ScoreDoc[] docs = c.topDocs(offset * start, offset).scoreDocs;

                    Formatter formatter = new SimpleHTMLFormatter(
                            "<font color='red'>", "</font>");
                    Scorer fragmentScore = new QueryScorer(query1);
                    Highlighter highlighter = new Highlighter(formatter,
                            fragmentScore);
                    Fragmenter fragmenter = new SimpleFragmenter(100);
                    highlighter.setTextFragmenter(fragmenter);

                    // Must run while the searcher is still open: subclasses
                    // load stored fields through it.
                    return get_search_result(docs, highlighter, search, count);
                } finally {
                    search.close();
                }
            } finally {
                r.close();
            }
        } finally {
            d.close();
        }
    }
}
搜索的实现子类:
package search;

import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.util.Version;

import output.CommonOutput;
import output.Strategy;

import util.LuceneUtil;

/**
 * Struts2 action that searches the video index.
 *
 * <p>Request parameters are bound through the setters: {@code word} (required
 * keywords matched against the video name), {@code tag} (optional, only
 * raises the score), {@code start} (zero-based page), {@code order_type}
 * (1 = newest first, 2 = most watched first, otherwise by relevance),
 * {@code watch_filter} (1-4 selects a duration bracket), {@code time_filter}
 * with {@code limit_date} (uploaded within the last N days) and
 * {@code type_filter} (video type id, 0 = any).
 */
public class Video_Search extends Abstract_Search_builder {
    // Was declared as "Word" while the accessors used "word", which did not compile.
    private String word;
    private String tag;
    private int start;
    private int order_type;
    private int limit_date;
    private Album_Search album_search;
    private Map<String, Object> info = null;
    private int offset = 20; // page size
    private int watch_filter;
    private int time_filter;
    private int type_filter;

    public void setAlbum_search(Album_Search album_search) {
        this.album_search = album_search;
    }

    public int getLimit_date() {
        return limit_date;
    }

    public void setLimit_date(int limit_date) {
        this.limit_date = limit_date;
    }

    public int getWatch_filter() {
        return watch_filter;
    }

    public void setWatch_filter(int watch_filter) {
        this.watch_filter = watch_filter;
    }

    public int getTime_filter() {
        return time_filter;
    }

    public void setTime_filter(int time_filter) {
        this.time_filter = time_filter;
    }

    public int getType_filter() {
        return type_filter;
    }

    public void setType_filter(int type_filter) {
        this.type_filter = type_filter;
    }

    public int getOrder_type() {
        return order_type;
    }

    public void setOrder_type(int order_type) {
        this.order_type = order_type;
    }

    public int getStart() {
        return start;
    }

    public void setStart(int start) {
        this.start = start;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public String getTag() {
        return tag;
    }

    public void setTag(String tag) {
        this.tag = tag;
    }

    /**
     * Builds the combined query: the keywords must match {@code video_name},
     * the tag (when given) may match {@code tag}, and every non-null filter
     * query must match.
     *
     * @param w  keywords for the video name
     * @param t  tag that boosts matching videos; null or blank to omit (a null
     *           tag used to crash the parser)
     * @param q3 optional filter, or null
     * @param q4 optional filter, or null
     * @param q5 optional filter, or null
     * @return the combined boolean query
     * @throws org.apache.lucene.queryParser.ParseException if {@code w} or
     *         {@code t} is not valid query syntax
     */
    public Query complex_video_query(String w, String t, Query q3, Query q4,
            Query q5) throws ParseException,
            org.apache.lucene.queryParser.ParseException {
        BooleanQuery query1 = new BooleanQuery();
        QueryParser name_parser = new QueryParser(Version.LUCENE_36,
                "video_name", analyzer);
        query1.add(name_parser.parse(w), BooleanClause.Occur.MUST);
        if (t != null && t.trim().length() > 0) {
            QueryParser tag_parser = new QueryParser(Version.LUCENE_36, "tag",
                    analyzer);
            query1.add(tag_parser.parse(t), BooleanClause.Occur.SHOULD);
        }
        // combine the filter conditions
        LuceneUtil.build_bool_query(query1, q3);
        LuceneUtil.build_bool_query(query1, q4);
        LuceneUtil.build_bool_query(query1, q5);
        return query1;
    }

    /**
     * Action entry point: runs the search described by the bound parameters
     * and publishes the result page and the echo of the parameters as request
     * attributes.
     *
     * @return {@code "success"}
     */
    public String video_search() throws IOException, ParseException,
            InvalidTokenOffsetsException, NumberFormatException,
            java.text.ParseException, SQLException,
            org.apache.lucene.queryParser.ParseException {
        // A negative page would ask the collector for <= 0 hits, which it rejects.
        int page = Math.max(0, getStart());
        int wanted = offset * page + offset;

        // ordering
        Sort sort = null;
        switch (getOrder_type()) {
        case 1:
            sort = new Sort(new SortField("video_uploadtime", SortField.LONG,
                    true));
            break;
        case 2:
            sort = new Sort(new SortField("watch", SortField.INT, true));
            break;
        default:
            break;
        }
        TopDocsCollector c;
        if (sort == null) {
            c = TopScoreDocCollector.create(wanted, false);
        } else {
            c = TopFieldCollector.create(sort, wanted, false, false, false,
                    false);
        }

        // duration bracket, in seconds
        int max_d = 0;
        int min_d = 0;
        switch (getWatch_filter()) {
        case 1:
            max_d = 10800;
            min_d = 3600;
            break;
        case 2:
            max_d = 3600;
            min_d = 1800;
            break;
        case 3:
            max_d = 1800;
            min_d = 600;
            break;
        case 4:
            max_d = 600;
            min_d = 0;
            break;
        default:
            break;
        }
        Query q3 = null;
        if (getWatch_filter() > 0 && getWatch_filter() < 5) {
            q3 = LuceneUtil.int_range_query(min_d, max_d, "duration");
        }

        Query q4 = null;
        if (getTime_filter() == 1) {
            Date now = new Date();
            Date before_date = LuceneUtil.getDateBefore(now, getLimit_date());
            q4 = LuceneUtil.long_range_query(before_date.getTime(),
                    now.getTime(), "video_uploadtime");
        }

        Query q5 = null;
        if (getType_filter() != 0) {
            q5 = LuceneUtil.int_range_query(getType_filter(), getType_filter(),
                    "video_type_id");
        }

        Query query1 = complex_video_query(getWord(), getTag(), q3, q4, q5);
        List<Map<String, Object>> list = do_search(page, offset, c, query1,
                "E://lucene/video");
        Strategy strategy1 = new CommonOutput();
        strategy1.output(list, request, info, "result");
        request.setAttribute("q", getWord());
        request.setAttribute("s", getOrder_type());
        request.setAttribute("wf", getWatch_filter());
        request.setAttribute("timef", getTime_filter());
        request.setAttribute("typef", getType_filter());
        request.setAttribute("ld", getLimit_date());
        request.setAttribute("cur", page);
        // Divide as double: int division truncated before ceil() could round
        // up, so a partially filled last page was never counted.
        request.setAttribute("count",
                Math.ceil((double) this.getCount() / offset));
        return "success";
    }

    /**
     * Turns each hit into a row with the keys {@code duration},
     * {@code video_uploadtime}, {@code video_id}, {@code tag},
     * {@code video_name}, {@code id}, {@code name}, {@code watch} and
     * {@code video_type_id}. Name and tag are highlighted when they match the
     * query; names longer than 12 characters are shortened first. Hits whose
     * stored document cannot be read are skipped.
     */
    @Override
    public List<Map<String, Object>> get_search_result(ScoreDoc[] docs,
            Highlighter highlighter, IndexSearcher search, int count) {
        this.setCount(count);
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(
                docs.length);
        long now = new Date().getTime();
        for (ScoreDoc hit : docs) {
            // Load the stored fields once instead of once per field.
            Document stored;
            try {
                stored = search.doc(hit.doc);
            } catch (IOException e) { // includes CorruptIndexException
                e.printStackTrace();
                continue;
            }

            String plain_name = stored.get("video_name");
            if (plain_name != null && plain_name.length() > 12) {
                plain_name = plain_name.substring(0, 12) + "..";
            }

            Map<String, Object> map = new HashMap<String, Object>();
            try {
                map.put("duration", LuceneUtil.cal_duration(Integer
                        .parseInt(stored.get("duration"))));
            } catch (NumberFormatException e) {
                e.printStackTrace();
            }
            try {
                map.put("video_uploadtime", LuceneUtil.dateDiff(
                        Long.parseLong(stored.get("video_uploadtime")), now));
            } catch (NumberFormatException | ParseException e) {
                e.printStackTrace();
            }
            map.put("video_id", stored.get("video_id"));
            map.put("tag", highlight(highlighter, "tag", stored.get("tag")));
            map.put("video_name",
                    highlight(highlighter, "video_name", plain_name));
            map.put("id", stored.get("id"));
            map.put("name", stored.get("name"));
            map.put("watch", stored.get("watch"));
            map.put("video_type_id", stored.get("video_type_id"));
            list.add(map);
        }
        return list;
    }

    /**
     * Returns {@code text} with query matches wrapped by the highlighter, or
     * {@code text} unchanged when nothing matches, highlighting fails, or
     * {@code text} is null.
     */
    private String highlight(Highlighter highlighter, String field, String text) {
        if (text == null) {
            return null;
        }
        try {
            String fragment = highlighter.getBestFragment(analyzer, field, text);
            return fragment != null ? fragment : text;
        } catch (IOException | InvalidTokenOffsetsException e) {
            e.printStackTrace();
            return text;
        }
    }
}
工具类:
package util;

import java.sql.SQLException;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.sql.DataSource;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.MapListHandler;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.json.JSONArray;
import org.json.JSONObject;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import com.mchange.v2.c3p0.ComboPooledDataSource;

/**
 * Static helpers shared by the index and search code: query construction,
 * display formatting, JSON conversion and simple database access.
 */
public class LuceneUtil {
    private static final long MILLIS_PER_DAY = 24L * 60 * 60 * 1000;

    /**
     * Serializes the rows as a JSON array of objects.
     *
     * @param list rows to convert
     * @return JSON text
     * @throws Exception if a value cannot be represented in JSON
     */
    public static String toJson(List<Map<String, Object>> list)
            throws Exception {
        JSONArray array = new JSONArray();
        for (Map<String, Object> map : list) {
            JSONObject member = new JSONObject();
            Set<Map.Entry<String, Object>> sets = map.entrySet();
            for (Map.Entry<String, Object> entry : sets) {
                member.put(entry.getKey(), entry.getValue());
            }
            array.put(member);
        }
        return array.toString();
    }

    /**
     * Adds {@code q2} to {@code q1} as a mandatory clause; a null {@code q2}
     * is ignored.
     *
     * @return {@code q1}
     */
    public static Query build_bool_query(BooleanQuery q1, Query q2) {
        if (q2 != null) {
            q1.add(q2, BooleanClause.Occur.MUST);
        }
        return q1;
    }

    /**
     * Formats a running time for display: {@code hh:mm:ss} from one hour up,
     * {@code mm:ss} from one minute up, otherwise just {@code ss}; every part
     * is zero-padded to two digits.
     *
     * <p>The former implementation compared strings with {@code ==} and so
     * returned wrong values for many inputs of one hour or more (3600 gave
     * "00", 3605 gave "05", 3720 gave "01:02:0"). Results below one hour are
     * unchanged.
     *
     * @param second running time in seconds; negative values are treated as 0
     * @return the formatted running time
     */
    public static String cal_duration(int second) {
        int total = Math.max(0, second);
        int hours = total / 3600;
        int minutes = total % 3600 / 60;
        int seconds = total % 60;
        if (hours > 0) {
            return pad2(hours) + ":" + pad2(minutes) + ":" + pad2(seconds);
        }
        if (minutes > 0) {
            return pad2(minutes) + ":" + pad2(seconds);
        }
        return pad2(seconds);
    }

    /** Renders a non-negative number with at least two digits. */
    private static String pad2(int value) {
        return value < 10 ? "0" + value : String.valueOf(value);
    }

    /**
     * Describes how long ago {@code startTime} was, relative to
     * {@code endTime}: days up to 30, months (30-day units) below a year,
     * years (365-day units) from 365 days on. Exactly 365 days used to fall
     * through every branch and return null.
     *
     * @param startTime earlier instant, epoch milliseconds
     * @param endTime   later instant, epoch milliseconds
     * @return the relative age, or null when {@code startTime} is a day or
     *         more after {@code endTime}
     * @throws java.text.ParseException never thrown; kept because callers
     *         catch it
     */
    public static String dateDiff(long startTime, long endTime)
            throws java.text.ParseException {
        long day = (endTime - startTime) / MILLIS_PER_DAY;
        if (day < 0) {
            return null;
        }
        if (day <= 30) {
            return day + "天前";
        }
        if (day < 365) {
            return day / 30 + "月前";
        }
        return day / 365 + "年前";
    }

    /** Range query on a long field; both bounds are inclusive. */
    public static Query long_range_query(final long min, final long max,
            final String field) {
        return NumericRangeQuery.newLongRange(field, min, max, true, true);
    }

    /** Range query on an int field; both bounds are inclusive. */
    public static Query int_range_query(int min, int max, final String field) {
        return NumericRangeQuery.newIntRange(field, min, max, true, true);
    }

    /**
     * Returns the instant {@code day} days before {@code d}.
     */
    public static Date getDateBefore(Date d, int day) {
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(d);
        calendar.add(Calendar.DATE, -day);
        return calendar.getTime();
    }

    /**
     * Looks up the bean named {@code dataSource} from
     * {@code applicationContext.xml} on the classpath. A new Spring context is
     * created on every call.
     */
    public static DataSource get_data_source() {
        ApplicationContext ctx = new ClassPathXmlApplicationContext(
                new String[] { "applicationContext.xml" });
        ComboPooledDataSource pool = (ComboPooledDataSource) ctx
                .getBean("dataSource");
        return (DataSource) pool;
    }

    /**
     * Runs a parameterized query and returns every row as a column-name to
     * value map.
     *
     * <p>The runner is built on the data source so that DbUtils closes the
     * connection it borrows; the previous version passed
     * {@code ds.getConnection()} and never closed it.
     *
     * @param ds     data source to query
     * @param sql    statement with {@code ?} placeholders
     * @param params values for the placeholders, in order
     * @return one map per result row
     * @throws SQLException if the query fails
     */
    public static List<Map<String, Object>> simple_query(DataSource ds,
            String sql, Object[] params) throws SQLException {
        QueryRunner runner = new QueryRunner(ds);
        List<Map<String, Object>> results = (List<Map<String, Object>>) runner
                .query(sql, new MapListHandler(), params);
        return results;
    }
}
个人感觉优酷的视频搜索应该用的就是lucene或者solr构建的,因为它的搜索域的session cookie中有jsessionid,这个是即便手动从后台代码中显示更换名字都是不行,因为它默认就在servelet中给你加了,除非去改servelet的源码。
另外,lucene建索引的速度比sphinx慢很多,而查询速度也比sphinx慢一点。所以如果应用的大部分不是构建在java上或者应用不大,还是用sphinx就已经可以很好的满足上面那些查询条件的需求了。
转载请注明:TheViper http://www.VEVb.com/TheViper
新闻热点
疑难解答