简单的java采集程序二

2019-11-14 21:03:09

字体：大中小

来源：转载

供稿：网友

简单的java采集程序二

继【简单的java采集程序】，这里将完成对整个网站的号码段的采集任务。

【使用预编译+批处理采集网页内容至数据库表中】

在之前我们用statement类来创建sql语句的执行对象，以实现插入字段到数据库的操作，但由于插入的数据量较大，如果继续用statement操作话，会很耗时间，我们用其子类PReparedStatement来进行操作。

PreparedStatement 可以实现sql语句的预编译，我们只需要通过其setString()方法传参即可，这样不仅效率提高了，而且也会更加安全，可防止SQL注入。推荐相关文章

另外我们还可以调用其addBatch()方法和 executeBatch()实现批量插入处理。

代码如下，喜欢把数据库链接作为一个单独的类

import java.sql.DriverManager;import java.sql.SQLException;import com.MySQL.jdbc.Connection;public class database {public static String driver ="com.mysql.jdbc.Driver";public static String url="jdbc:mysql://127.0.0.1:3306/tele_dat?autoReconnect=true&characterEncoding=UTF-8";public static String user   ="root";public static String passWord = "123456";public static java.sql.Connection conn = null;//返回一个数据库连接对象public static  Connection  ConnectToDataBase(){try {Class.forName(driver);} catch (ClassNotFoundException e) {System.out.println("加载驱动失败");e.printStackTrace();}try {conn = DriverManager.getConnection(url, user, password);System.out.println("连接成功");} catch (SQLException e) {System.out.println("连接出问题了");e.printStackTrace();}return (Connection) conn;}//测试连接数据库public static void main(String args[]){database.ConnectToDataBase();}}

主体程序

import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.URL;import java.sql.Connection;import java.sql.PreparedStatement;import java.util.regex.Matcher;import java.util.regex.Pattern;public class Crawl {private static Connection conn = database.ConnectToDataBase();      static String home_url = "http://www.hiphop8.com";   //网站首页        static String pattern_pro_city = "<DIV class=title><SPAN>(.*?) - (.*?)<///SPAN><///DIV>";   //匹配省名，市名 static String pattern_number = ">(13//d{5}|15//d{5}|18//d{5}|147//d{4})<";  //匹配号码段 static String pattern_pro ="//w{3}//.//w{7}//.//w{3}/////w{4}/////w+";   //省份URLstatic String pattern_city_hz="<LI><A href=/"(.*?)/" target=_blank>";    //城市URL的后缀 //编译预处理相关选项 static String insertSQL = "insert ignore into number_segment(segment,province,city) values(?, ?, ?)";    static PreparedStatement pst = null; static int num_pro = 0;static int num_city=0;    static int all_num_tele = 0;public static void  main(String[] args) throws Exception {String PreStat =  "insert ignore into number_segment(segment,province,city) values (?,?,?) ";  pst = conn.prepareStatement(PreStat.toString()); Matcher mat_home = get(home_url,pattern_pro);long start = System.currentTimeMillis();while(mat_home.find()){num_pro++;System.out.println("------第"+num_pro+"个省-----");String city_url_qz = "http://"+mat_home.group()+"/";int len = city_url_qz.length();//这里换成StringBuffer来最字符串进行相加处理StringBuffer city_ur = new StringBuffer();city_ur.append(city_url_qz);Matcher mat_city_hz = get(city_url_qz,pattern_city_hz);while(mat_city_hz.find())                  //通过拼接获得 城市的完整url{num_city++;System.out.println("第"+num_city+"个市");String last_city_url=city_ur.append(mat_city_hz.group(1)).toString();//String last_city_url = city_url_qz + mat_city_hz.group(1);int len2 = last_city_url.length();One_City_Tele_to_DB(last_city_url);city_ur.delete(len,len2);}}long end = System.currentTimeMillis();long time = (end-start)/(1000*60);conn.close();System.out.println("查询到的电话号码段总数量："+all_num_tele);System.out.println("花费的时间是:"+time);} public  static void One_City_Tele_to_DB(String url) throws Exception    {       int this_city_num=0;    String pro = null;    String city = null;     Matcher mat_pro_city = get(url,pattern_pro_city);  //获取省名字  市名字 while(mat_pro_city.find()) { String long_pro = mat_pro_city.group(1);     pro = long_pro.substring(0, long_pro.length()-1); String long_city = mat_pro_city.group(2);     city = long_city.substring(0, long_city.length()-10); System.out.println("省份:"+pro+"  "+"城市："+city+"     正在插入号码段进数据库");  }  Matcher mat_number = get(url,pattern_number);      //获取号码段 while(mat_number.find()) { pst.setString(1,mat_number.group(1)); pst.setString(2, pro); pst.setString(3, city); pst.addBatch(); this_city_num++; all_num_tele++; } pst.executeBatch();  //每次批量插入一个城市的号码段 pst.clearBatch(); System.out.println("该市插入的号码段的数量是："+ this_city_num);} //正则匹配public static  Matcher get(String str_url, String pattern) throws Exception {        String urlsource = get_Html(str_url);Pattern p = Pattern.compile(pattern);Matcher m = p.matcher(urlsource);return m;}//获取网页内容public static String get_Html(String str_url) throws IOException{    URL url = new URL(str_url);    String content="";StringBuffer page = new StringBuffer();try {BufferedReader in = new BufferedReader(new InputStreamReader(url                    .openStream()));while((content = in.readLine()) != null){page.append(content);}} catch (IOException e) {e.printStackTrace();}        return page.toString();    }}

实际运行程序，发现有500多个重复的号码段，因为襄樊市改成襄阳市，这两个市的号码段全部一样，而数据库表中是以segment(号码)作为主键，所以要设置，当插入有相同主键的sql语句时，自动忽略跳过，方法就是在insert 后面加上ignore就可以了。

另外设置id为auto_increment，但如果把数据表里的数据清空之后，id不会从1重新开始，这时只要在mysql命令行下输入 truncate table table_name 就可以实现id从1开始了。

运行结果截图

显然，6分钟的速度还是太慢了，后面试了几次都是在6~8分钟之内（不过相对于不用批处理而直接用statement已经快很多了）。因此还得想办法优化。

【使用智能批量处理+StringBuilder的append()方法】

在插入mysql数据库表中时，可以以insert ignore into number_segment(segment,province,city) values(?, ?, ?)，vaulues(?)，values(?,?,?)…的形式进行批量插入，上面使用setString()方法传参还是太慢了，直接用StringBuilder的append()方法吧，我们都知道appedn()方法进行字符串相加处理的效率是很高的，所以我们每次在insert语句后面加入一段“vaulues(?，？，？)，”这样形式的字符串，然后批处理插入，这里通过变量all_tele_num进行批量处理的的控制，我们规定每次批量执行2000个数据段插入。

直接看代码吧

import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.URL;import java.sql.Connection;import java.sql.PreparedStatement;import java.util.regex.Matcher;import java.util.regex.Pattern;public class SecondCrawl {private static Connection conn = database.ConnectToDataBase();//预编译 + StringBuilderstatic StringBuilder PreStat = new StringBuilder(); static String Qz = "insert ignore into number_segment(segment,province,city) values"; static String insertSQL = "insert ignore into number_segment(segment,province,city) values(?, ?, ?)";    static int len1 = Qz.length();    static PreparedStatement pst = null;    static String home_url = "http://www.hiphop8.com";        static String pattern_pro_city = "<DIV class=title><SPAN>(.*?) - (.*?)<///SPAN><///DIV>";   //匹配省名，市名 static String pattern_number = ">(13//d{5}|15//d{5}|18//d{5}|147//d{4})<";  //匹配号码段 static String pattern_pro ="//w{3}//.//w{7}//.//w{3}/////w{4}/////w+";   //省份URLstatic String pattern_city_hz="<LI><A href=/"(.*?)/" target=_blank>";    //城市URL的后缀 static int num_pro = 0;static int num_city=0;static int all_num_tele=0;public static void  main(String[] args) throws Exception {Matcher mat_home = get(home_url,pattern_pro); conn.setAutoCommit(true);  PreStat.append(Qz);  pst = conn.prepareStatement(insertSQL);   //预编译long start = System.currentTimeMillis();while(mat_home.find()){num_pro++;System.out.println("------第"+num_pro+"个省-----");String city_url_qz = "http://"+mat_home.group()+"/";int len = city_url_qz.length();StringBuffer city_ur = new StringBuffer();city_ur.append(city_url_qz);Matcher mat_city_hz = get(city_url_qz,pattern_city_hz);while(mat_city_hz.find())        //获得城市的url{num_city++;System.out.println("第"+num_city+"个市");String city_url=city_ur.append(mat_city_hz.group(1)).toString();int len2 = city_url.length();One_City_Tele_to_DB(city_url);city_ur.delete(len,len2);}}long end = System.currentTimeMillis();long time = (end-start)/(1000*60);pst.executeBatch();   //批处理执行最后面剩余的部分conn.close();System.out.println("查询到的电话号码段总数量："+all_num_tele);System.out.println("花费的时间是:"+time+"分多钟/n"+"以微秒为单位："+(end-start)+"微秒");}//一个城市的手机号码段处理函数public  static void One_City_Tele_to_DB(String url) throws Exception    { String city=null; String pro =null;    int this_city_num = 0; Matcher mat_pro_city = get(url,pattern_pro_city);  while(mat_pro_city.find()) { String long_pro = mat_pro_city.group(1); pro = long_pro.substring(0, long_pro.length()-1); String long_city = mat_pro_city.group(2); city = long_city.substring(0, long_city.length()-10);  System.out.println("省份:"+pro+"  "+"城市："+city+"     正在插入号码段进数据库...");  }  String temp = ",'"+pro+"','"+city+"'),"; Matcher mat_number = get(url,pattern_number); while(mat_number.find()) { PreStat.append("("+mat_number.group(1)).append(temp); this_city_num++; all_num_tele++; if(all_num_tele<=208000 && all_num_tele % 2000==0) { PreStat.deleteCharAt(PreStat.length()-1);   //除去sql语句后的逗号 pst.addBatch(PreStat.toString());  pst.executeBatch(); pst.clearBatch(); PreStat.delete(len1, PreStat.length());   //情况sql语句后面部分以释放空间 } } if(all_num_tele>208000)             //后面不足2000部分的城市加入批处理中先不执行 { PreStat.deleteCharAt(PreStat.length()-1); pst.addBatch(PreStat.toString()); PreStat.delete(len1, PreStat.length()); } System.out.println("该市插入的号码段的数量是："+ this_city_num);} //正则匹配public static  Matcher get(String str_url, String pattern) throws Exception {       String urlsource = get_Html(str_url);Pattern p = Pattern.compile(pattern);Matcher m = p.matcher(urlsource);return m;}//获取网页内容public static String get_Html(String str_url) throws IOException{   URL url = new URL(str_url);   String content="";StringBuffer page = new StringBuffer();try {BufferedReader in = new BufferedReader(new InputStreamReader(url                   .openStream()));while((content = in.readLine()) != null){page.append(content);}} catch (IOException e) {e.printStackTrace();}       return page.toString();   }}

运行截图 030641D5CD414D92B15219D95CAFC302