首页 > 学院 > 开发设计 > 正文

lucene搜索脚手架

2019-11-14 21:50:29
字体:
来源:转载
供稿:网友
lucene搜索脚手架

本文旨在做一个lucene的搜索模板,提供类似于优酷的视频搜索服务。

效果:

基于lucene3.6.1

由于数量少就没有分页,实际上是可以分页的,例子是20个结果一页。

有些重复用相同原理的搜索选项就没弄上去了。

中文分词用的是IKAnalyzer2012。tag的作用在于增加匹配的权重,例子的tag是sfa。

既然是模板,当然主要是用了java设计模式里面的模板方法了。

建索引的模板:

 1 package index; 2  3 import java.io.File; 4 import java.io.IOException; 5 import java.sql.SQLException; 6 import java.text.ParseException; 7 import java.util.List; 8 import java.util.Map; 9 10 import javax.sql.DataSource;11 12 import org.apache.commons.dbutils.QueryRunner;13 import org.apache.commons.dbutils.handlers.MapListHandler;14 import org.apache.lucene.analysis.Analyzer;15 import org.apache.lucene.index.IndexWriter;16 import org.apache.lucene.index.IndexWriterConfig;17 import org.apache.lucene.index.IndexWriterConfig.OpenMode;18 import org.apache.lucene.store.FSDirectory;19 import org.sPRingframework.context.applicationContext;20 import org.springframework.context.support.ClassPathxmlApplicationContext;21 import org.wltea.analyzer.lucene.IKAnalyzer;22 23 import com.mchange.v2.c3p0.ComboPooledDataSource;24 25 public abstract class Abstract_Index_builder implements Index_Builder {26     private Analyzer analyzer = new IKAnalyzer();27 28     public DataSource get_data_source() {29         ApplicationContext ctx = new ClassPathXmlApplicationContext(30                 new String[] { "applicationContext.xml" });//spring配置31         ComboPooledDataSource c3po_ds = (ComboPooledDataSource) ctx32                 .getBean("dataSource");//spring bean,也就是c3po连接池的实例33         return (DataSource) c3po_ds;34     }35 36     public List<Map<String, Object>> do_query(DataSource ds, String sql)37             throws SQLException {38         QueryRunner qr = new QueryRunner();//commons dbutils的queryRunner查询数据库39         List<Map<String, Object>> results = (List<Map<String, Object>>) qr40                 .query(ds.getConnection(), sql, new MapListHandler());//在继承模板的之类中具体实现查询 41         return results;42     }43 44     public abstract void add_to_writer(List<Map<String, Object>> data,45             IndexWriter writer);46 47     public void generate_index(String path, String sql) throws IOException,48             SQLException, ParseException {//开始建索引49         IndexWriterConfig 
conf = new IndexWriterConfig(50                 org.apache.lucene.util.Version.LUCENE_36, analyzer);51         conf.setOpenMode(OpenMode.CREATE);52         File file = new File(path);53         FSDirectory directory = FSDirectory.open(file);54         IndexWriter writer = new IndexWriter(directory, conf);55         List<Map<String, Object>> data = do_query(get_data_source(), sql);56         add_to_writer(data, writer);//写入57         writer.close();58     }59 }

具体的实现子类:

 1 package index; 2  3 import java.io.IOException; 4 import java.text.ParseException; 5 import java.text.SimpleDateFormat; 6 import java.util.Date; 7 import java.util.List; 8 import java.util.Map; 9 10 import org.apache.lucene.document.Document;11 import org.apache.lucene.document.Field;12 import org.apache.lucene.document.NumericField;13 import org.apache.lucene.document.Field.Index;14 import org.apache.lucene.document.Field.Store;15 import org.apache.lucene.index.IndexWriter;16 17 public class Generate_Video_Index extends Abstract_Index_builder {18 19     @Override20     public void add_to_writer(List<Map<String, Object>> data, IndexWriter writer) {21         SimpleDateFormat sdf = new SimpleDateFormat("yyyy.MM.dd");22         for (int i = 0, len = data.size(); i < len; i++) {23             Document doc = new Document();24             Date date = null;25             try {26                 date = sdf27                         .parse(data.get(i).get("video_uploadtime").toString());28             } catch (ParseException e) {29                 e.printStackTrace();30             }31             Field video_name = new Field("video_name", data.get(i)32                     .get("video_name").toString(), Store.YES, Index.ANALYZED);33             Field tag = new Field("tag", data.get(i).get("tag").toString(),34                     Store.YES, Index.ANALYZED);35             NumericField video_uploadtime = new NumericField(36                     "video_uploadtime", Store.YES, true).setLongValue(date37                     .getTime());//发布时间38             NumericField duration = new NumericField("duration", Store.YES,39                     true).setIntValue((int) data.get(i).get("duration"));//片长40             Field id = new Field("id", data.get(i).get("id").toString(),41                     Store.YES, Index.NOT_ANALYZED);//用户id42             NumericField video_type_id = new NumericField("video_type_id",43                     Store.YES, true).setIntValue((int) 
data.get(i).get(44                     "video_type_id"));//分类45             NumericField chanel_id = new NumericField("chanel_id", Store.YES,46                     true).setIntValue((int) data.get(i).get("chanel_id"));47             NumericField video_id = new NumericField("video_id", Store.YES,48                     true).setIntValue((int) data.get(i).get("video_id"));49             Field name = new Field("name", data.get(i).get("name").toString(),50                     Store.YES, Index.NOT_ANALYZED);51             Field watch = new Field("watch", data.get(i)52                     .get("video_watchtimes").toString(), Store.YES,53                     Index.NOT_ANALYZED);//播放次数54             doc.add(video_id);55             doc.add(chanel_id);56             doc.add(video_name);57             doc.add(tag);58             doc.add(video_uploadtime);59             doc.add(id);60             doc.add(name);61             doc.add(watch);62             doc.add(duration);63             doc.add(video_type_id);64             try {65                 writer.addDocument(doc);66             } catch (IOException e) {67                 e.printStackTrace();68             }69         }70     }71 }

调用建立索引模板:

实际上不用把需要展示的所有搜索结果项都写入lucene,只用把id之类的写入就可以了。lucene完成查询后取出id再从数据库查询一遍。例子只是为了简单方便才那样做的。

 1 package index; 2  3 import java.io.IOException; 4 import java.sql.SQLException; 5 import java.text.ParseException; 6  7 public class Build_Index { 8     public static void build_index() throws SQLException, IOException, 9             java.text.ParseException {10         Abstract_Index_builder video_index_builder = new Generate_Video_Index();11         video_index_builder12                 .generate_index(13                         "E://lucene/video",14                         "select user.name,vi.video_id,vi.video_name,vi.video_watchtimes,vi.video_uploadtime,vi.tag,vi.id,vi.duration,type.video_type_id,vi.chanel_id "15                                 + "from video_info vi inner join user as "16                                 + "user on vi.id=user.id inner join video_sub_type as sub_type on sub_type.video_sub_type_id=vi.video_sub_type_id inner "17                                 + "join video_type as type on type.video_type_id=sub_type.video_type_id");18     }19 20     public static void main(String[] args) throws SQLException, IOException,21             ParseException {22         Build_Index.build_index();23     }24 }

搜索模板:

 1 package search; 2  3 import java.io.File; 4 import java.io.IOException; 5 import java.util.List; 6 import java.util.Map; 7  8 import javax.servlet.http.HttpServletRequest; 9 10 import org.apache.lucene.analysis.Analyzer;11 import org.apache.lucene.index.IndexReader;12 import org.apache.lucene.search.IndexSearcher;13 import org.apache.lucene.search.Query;14 import org.apache.lucene.search.ScoreDoc;15 import org.apache.lucene.search.TopDocs;16 import org.apache.lucene.search.TopDocsCollector;17 import org.apache.lucene.search.TopScoreDocCollector;18 import org.apache.lucene.search.highlight.Formatter;19 import org.apache.lucene.search.highlight.Fragmenter;20 import org.apache.lucene.search.highlight.Highlighter;21 import org.apache.lucene.search.highlight.QueryScorer;22 import org.apache.lucene.search.highlight.Scorer;23 import org.apache.lucene.search.highlight.SimpleFragmenter;24 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;25 import org.apache.lucene.store.FSDirectory;26 import org.apache.struts2.interceptor.ServletRequestAware;27 import org.wltea.analyzer.lucene.IKAnalyzer;28 29 import com.opensymphony.xwork2.ActionSupport;30 31 public abstract class Abstract_Search_builder extends ActionSupport implements32         Search_Builder, ServletRequestAware {//struts233     protected Analyzer analyzer = new IKAnalyzer();34     public HttpServletRequest request;35     protected int count;36 37     public int getCount() {38         return count;39     }40 41     public void setCount(int count) {42         this.count = count;43     }44 45     public abstract List<Map<String, Object>> get_search_result(46             ScoreDoc[] docs, Highlighter highlighter, IndexSearcher search,47             int count);//获取搜索结果写入request48 49     public void setServletRequest(HttpServletRequest request) {50         this.request = request;51     }52 53     public List<Map<String, Object>> do_search(int start, int offset,54             TopDocsCollector c, Query query1, 
String path) throws IOException {55         FSDirectory d = FSDirectory.open(new File(path));56         IndexReader r = IndexReader.open(d);57         IndexSearcher search = new IndexSearcher(r);58         if (c == null) {//不是分页请求59             c = TopScoreDocCollector.create(search.maxDoc(), false);60             offset = search.maxDoc();61             start = 0;62         }63         search.search(query1, c);64         int count = c.getTotalHits();65         ScoreDoc[] docs = c.topDocs(offset * start, offset).scoreDocs;66         Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",67                 "</font>");68         Scorer fragmentScore = new QueryScorer(query1);69         Highlighter highlighter = new Highlighter(formatter, fragmentScore);70         Fragmenter fragmenter = new SimpleFragmenter(100);71         highlighter.setTextFragmenter(fragmenter);72         List<Map<String, Object>> list = get_search_result(docs, highlighter,73                 search, count);74         search.close();75         return list;76     }77 }

搜索的实现子类:

  1 package search;  2   3 import java.io.IOException;  4 import java.sql.SQLException;  5 import java.text.ParseException;  6 import java.util.ArrayList;  7 import java.util.Date;  8 import java.util.HashMap;  9 import java.util.List; 10 import java.util.Map; 11  12 import org.apache.lucene.index.CorruptIndexException; 13 import org.apache.lucene.queryParser.QueryParser; 14 import org.apache.lucene.search.BooleanClause; 15 import org.apache.lucene.search.BooleanQuery; 16 import org.apache.lucene.search.IndexSearcher; 17 import org.apache.lucene.search.Query; 18 import org.apache.lucene.search.ScoreDoc; 19 import org.apache.lucene.search.Sort; 20 import org.apache.lucene.search.SortField; 21 import org.apache.lucene.search.TopDocsCollector; 22 import org.apache.lucene.search.TopFieldCollector; 23 import org.apache.lucene.search.TopScoreDocCollector; 24 import org.apache.lucene.search.highlight.Highlighter; 25 import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; 26 import org.apache.lucene.util.Version; 27  28 import output.CommonOutput; 29 import output.Strategy; 30  31 import util.LuceneUtil; 32  33 public class Video_Search extends Abstract_Search_builder { 34     private String Word; 35     private String tag; 36     private int start; 37     private int order_type; 38     private int limit_date; 39     private Album_Search album_search; 40     private Map<String, Object> info = null; 41     private int offset = 20; 42     private int watch_filter; 43     private int time_filter; 44     private int type_filter; 45  46     public void setAlbum_search(Album_Search album_search) { 47         this.album_search = album_search; 48     } 49  50     public int getLimit_date() { 51         return limit_date; 52     } 53  54     public void setLimit_date(int limit_date) { 55         this.limit_date = limit_date; 56     } 57  58     public int getWatch_filter() { 59         return watch_filter; 60     } 61  62     public void setWatch_filter(int 
watch_filter) { 63         this.watch_filter = watch_filter; 64     } 65  66     public int getTime_filter() { 67         return time_filter; 68     } 69  70     public void setTime_filter(int time_filter) { 71         this.time_filter = time_filter; 72     } 73  74     public int getType_filter() { 75         return type_filter; 76     } 77  78     public void setType_filter(int type_filter) { 79         this.type_filter = type_filter; 80     } 81  82     public int getOrder_type() { 83         return order_type; 84     } 85  86     public void setOrder_type(int order_type) { 87         this.order_type = order_type; 88     } 89  90     public int getStart() { 91         return start; 92     } 93  94     public void setStart(int start) { 95         this.start = start; 96     } 97  98     public String getWord() { 99         return word;100     }101 102     public void setWord(String word) {103         this.word = word;104     }105 106     public String getTag() {107         return tag;108     }109 110     public void setTag(String tag) {111         this.tag = tag;112     }113 114     public Query complex_video_query(String w, String t, Query q3, Query q4,115             Query q5) throws ParseException,116             org.apache.lucene.queryParser.ParseException {117         QueryParser qp = new QueryParser(Version.LUCENE_36, "video_name",118                 analyzer);119         Query q1 = qp.parse(w);120         QueryParser parser = new QueryParser(Version.LUCENE_36, "tag", analyzer);121         Query q2 = parser.parse(t);122         BooleanQuery query1 = new BooleanQuery();123         query1.add(q1, BooleanClause.Occur.MUST);124         query1.add(q2, BooleanClause.Occur.SHOULD);125         LuceneUtil.build_bool_query(query1, q3);126         LuceneUtil.build_bool_query(query1, q4);127         LuceneUtil.build_bool_query(query1, q5);//将查询条件组合128         return query1;129     }130 131     public String video_search() throws IOException, ParseException,132           
  InvalidTokenOffsetsException, NumberFormatException,133             java.text.ParseException, SQLException,134             org.apache.lucene.queryParser.ParseException {//请求从这里进入135         Sort sort = null;136         TopDocsCollector c = null;137         Query q3 = null;138         Query q4 = null;139         Query q5 = null;140         int max_d = 0;141         int min_d = 0;142         switch (getOrder_type()) {//排序143         case 1:144             sort = new Sort(new SortField("video_uploadtime", SortField.LONG,145                     true));146             break;147         case 2:148             sort = new Sort(new SortField("watch", SortField.INT, true));149             break;150         default:151             break;152         }153         if (sort == null)154             c = TopScoreDocCollector155                     .create(offset * getStart() + offset, false);156         else157             c = TopFieldCollector.create(sort, offset * getStart() + offset,158                     false, false, false, false);159         switch (getWatch_filter()) {//结果过滤160         case 1:161             max_d = 10800;162             min_d = 3600;163             break;164         case 2:165             max_d = 3600;166             min_d = 1800;167             break;168         case 3:169             max_d = 1800;170             min_d = 600;171             break;172         case 4:173             max_d = 600;174             min_d = 0;175             break;176         default:177             max_d = 0;178             min_d = 0;179             break;180         }181         if (getWatch_filter() < 5 && getWatch_filter() > 0) {182             q3 = LuceneUtil.int_range_query(min_d, max_d, "duration");183         }184         if (getTime_filter() == 1) {185             Date before_date = LuceneUtil.getDateBefore(new Date(),186                     getLimit_date());187             q4 = LuceneUtil.long_range_query(before_date.getTime(),188                     new 
Date().getTime(), "video_uploadtime");189         }190         if (getType_filter() != 0)191             q5 = LuceneUtil.int_range_query(getType_filter(), getType_filter(),192                     "video_type_id");193         Query query1 = complex_video_query(getWord(), getTag(), q3, q4, q5);194         List<Map<String, Object>> list = do_search(getStart(), offset, c,195                 query1, "E://lucene/video");196         Strategy strategy1 = new CommonOutput();197         strategy1.output(list, request, info, "result");198         request.setAttribute("q", getWord());199         request.setAttribute("s", getOrder_type());200         request.setAttribute("wf", getWatch_filter());201         request.setAttribute("timef", getTime_filter());202         request.setAttribute("typef", getType_filter());203         request.setAttribute("ld", getLimit_date());204         request.setAttribute("cur", getStart());205         request.setAttribute("count", Math.ceil(this.getCount() / offset));206         return "success";207     }208 209     @Override210     public List<Map<String, Object>> get_search_result(ScoreDoc[] docs,211             Highlighter highlighter, IndexSearcher search, int count) {212         this.setCount(count);213         List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();214         for (int i = 0; i < docs.length; i++) {215             int sd = docs[i].doc;216             Map<String, Object> map = new HashMap<String, Object>();217             String video_name1 = null;218             String video_name = null;219             String name = null;220             String tag1 = null;221             String tag = null;222             String duration = null;223             String id = null;224             String watch = null;225             String video_type_id = null;226             String video_id = null;227             try {228                 video_name1 = search.doc(sd).get("video_name");229                 if (video_name1.length() > 
12)230                     video_name1 = video_name1.substring(0, 12) + "..";231                 tag1 = search.doc(sd).get("tag");232                 duration = search.doc(sd).get("duration");233                 id = search.doc(sd).get("id");234                 video_id = search.doc(sd).get("video_id");235                 name = search.doc(sd).get("name");236                 watch = search.doc(sd).get("watch");237                 video_type_id = search.doc(sd).get("video_type_id");238             } catch (CorruptIndexException e1) {239                 e1.printStackTrace();240             } catch (IOException e1) {241                 e1.printStackTrace();242             }243             try {244                 video_name = highlighter.getBestFragment(analyzer,245                         "video_name", video_name1);246                 tag = highlighter.getBestFragment(analyzer, "tag", tag1);247             } catch (IOException | InvalidTokenOffsetsException e1) {248                 e1.printStackTrace();249             }250             if (tag == null)251                 tag = tag1;252             if (video_name == null)253                 video_name = video_name1;254             map.put("duration",255                     LuceneUtil.cal_duration(Integer.parseInt(duration)));256             // try {257             // map.put("video_uploadtime",258             // Long.parseLong(search.doc(sd).get("video_uploadtime")));259             // } catch (NumberFormatException | IOException e) {260             // // TODO Auto-generated catch block261             // e.printStackTrace();262             // }263             try {264                 map.put("duration",265                         LuceneUtil.cal_duration(Integer.parseInt(duration)));266                 map.put("video_uploadtime",267                         LuceneUtil.dateDiff(268                                 Long.parseLong(search.doc(sd).get(269                                         "video_uploadtime")),270         
                        new Date().getTime()));271             } catch (NumberFormatException | ParseException | IOException e) {272                 e.printStackTrace();273             }274             map.put("video_id", video_id);275             map.put("tag", tag);276             map.put("video_name", video_name);277             map.put("id", id);278             map.put("name", name);279             map.put("watch", watch);280             map.put("video_type_id", video_type_id);281             list.add(map);282         }283         return list;284     }285 }

工具类:

  1 package util;  2   3 import java.sql.SQLException;  4 import java.util.Calendar;  5 import java.util.Date;  6 import java.util.List;  7 import java.util.Map;  8 import java.util.Set;  9  10 import javax.sql.DataSource; 11  12 import org.apache.commons.dbutils.QueryRunner; 13 import org.apache.commons.dbutils.handlers.MapListHandler; 14 import org.apache.lucene.search.BooleanClause; 15 import org.apache.lucene.search.BooleanQuery; 16 import org.apache.lucene.search.NumericRangeQuery; 17 import org.apache.lucene.search.Query; 18 import org.json.JSONArray; 19 import org.json.JSONObject; 20 import org.springframework.context.ApplicationContext; 21 import org.springframework.context.support.ClassPathXmlApplicationContext; 22  23 import com.mchange.v2.c3p0.ComboPooledDataSource; 24  25 public class LuceneUtil { 26     public static String toJson(List<Map<String, Object>> list) 27             throws Exception {//转换成json 28         JSONArray array = new JSONArray(); 29         JSONObject member = null; 30         for (Map<String, Object> map : list) { 31             member = new JSONObject(); 32             Set<Map.Entry<String, Object>> sets = map.entrySet(); 33             for (Map.Entry<String, Object> entry : sets) { 34                 member.put((String) entry.getKey(), (Object) entry.getValue()); 35             } 36             array.put(member); 37         } 38         return array.toString(); 39     } 40  41     public static Query build_bool_query(BooleanQuery q1, Query q2) { 42         if (q2 != null) 43             q1.add(q2, BooleanClause.Occur.MUST); 44         return q1; 45     } 46  47     public static String cal_duration(int second) {//片长格式化 48         String h = "0"; 49         String d = "0"; 50         String s = "0"; 51         int temp = second % 3600; 52         if (second > 3600) { 53             h = String.valueOf(second / 3600); 54             if (Integer.parseInt(h) < 10) { 55                 h = "0" + h; 56             } 57             if 
(temp != 0) { 58                 if (temp > 60) { 59                     d = String.valueOf(temp / 60); 60                     if (Integer.parseInt(d) < 10) { 61                         d = "0" + d; 62                     } 63                     if (temp % 60 != 0) { 64                         s = String.valueOf(temp % 60); 65                         if (Integer.parseInt(s) < 10) { 66                             s = "0" + s; 67                         } 68                     } 69                 } else { 70                     s = String.valueOf(temp); 71                     d = "00"; 72                     if (Integer.parseInt(s) < 10) { 73                         s = "0" + s; 74                     } 75                 } 76             } else { 77                 d = "00"; 78                 s = "00"; 79             } 80         } else { 81             h = "00"; 82             d = String.valueOf(second / 60); 83             if (Integer.parseInt(d) < 10) { 84                 d = "0" + d; 85             } 86             if (Integer.parseInt(d) % 60 == 0) { 87  88                 h = String.valueOf((Integer.parseInt(d) / 60)); 89                 if (Integer.parseInt(h) < 10) { 90                     h = "0" + h; 91                 } 92                 d = "00"; 93             } 94             s = "00"; 95             if (second % 60 != 0) { 96                 s = String.valueOf(second % 60); 97                 if (Integer.parseInt(s) < 10) { 98                     s = "0" + s; 99                 }100             }101         }102         if (h == "00")103             return d + ":" + s;104         if (d == "00")105             return s;106         return h + ":" + d + ":" + s;107     }108 109     public static String dateDiff(long startTime, long endTime)110             throws java.text.ParseException {//将上传时间转换为多少日(月,年)前111         long nd = 1000 * 24 * 60 * 60;112         long diff = endTime - startTime;113         long day = diff / nd;114         if (day > 30 
&& day < 365)115             return day / 30 + "月前";116         if (0 <= day && day <= 30)117             return day + "天前";118         if (day > 365)119             return day / 365 + "年前";120         return null;121     }122     //范围查询123     public static Query long_range_query(final long min, final long max,124             final String field) {125         return NumericRangeQuery.newLongRange(field, min, max, true, true);126     }127 128     public static Query int_range_query(int min, int max, final String field) {129         return NumericRangeQuery.newIntRange(field, min, max, true, true);130     }131 132     public static Date getDateBefore(Date d, int day) {133         Calendar now = Calendar.getInstance();134         now.setTime(d);135         now.set(Calendar.DATE, now.get(Calendar.DATE) - day);136         return now.getTime();137     }138 139     public static DataSource get_data_source() {140         ApplicationContext ctx = new ClassPathXmlApplicationContext(141                 new String[] { "applicationContext.xml" });142         ComboPooledDataSource c3po_ds = (ComboPooledDataSource) ctx143                 .getBean("dataSource");144         return (DataSource) c3po_ds;145     }146 147     public static List<Map<String, Object>> simple_query(DataSource ds,148             String sql, Object[] params) throws SQLException {149         QueryRunner qr = new QueryRunner();150         List<Map<String, Object>> results = (List<Map<String, Object>>) qr151                 .query(ds.getConnection(), sql, new MapListHandler(), params);152         return results;153     }154 }

个人感觉优酷的视频搜索应该就是用lucene或者solr构建的,因为它搜索域的session cookie中有jsessionid。这个名字即便在后台代码中显式更换也是不行的,因为servlet默认就会给你加上它,除非去改servlet的源码。

另外,lucene建索引的速度比sphinx慢很多,而查询速度也比sphinx慢一点。所以如果应用的大部分不是构建在java上或者应用不大,还是用sphinx就已经可以很好的满足上面那些查询条件的需求了。

转载请注明:TheViper http://www.VEVb.com/TheViper


发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表