首页 > 开发 > Java > 正文

java实现登录之后抓取数据

2024-07-14 08:41:45
字体:
来源:转载
供稿:网友

最近做了一个从网络上抓取数据的小程序,主要用于信贷方面:收集了一些黑名单网站,把这些网站上的数据抓取到自己的系统中。

也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。

首先需要一个jsoup的jar包,我用的是1.6.0。下载地址为:http://pan.baidu.com/s/1mgqOuHa

1,获取网页内容(核心代码,技术有限没封装)。

2,登录之后抓取网页数据(如何在请求中携带cookie)。

3,获取网站的ajax请求方法(返回json)。

以上这三点我就用一个类全部包含(比较粗糙,望见谅;直接copy代码过去,应该就可以用)

一,这个类分别包含上面1、2、3三种方法,直接运行main方法就可以进行测试

package com.minxinloan.black.web.utils;import java.io.BufferedReader;import java.io.ByteArrayOutputStream;import java.io.DataInputStream;import java.io.DataOutputStream;import java.io.File;import java.io.FileOutputStream;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.OutputStream;import java.io.PrintWriter;import java.net.HttpURLConnection;import java.net.URL;import java.net.URLConnection;import java.net.URLEncoder;import java.nio.charset.Charset;import java.util.ArrayList;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Map.Entry;import java.util.StringTokenizer;import net.sf.json.JSONArray;import net.sf.json.JSONObject;import org.jsoup.Connection;import org.jsoup.Connection.Method;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class CookieUtil {  public final static String CONTENT_TYPE = "Content-Type";  public static void main(String[] args) {        //String loginURL = "http://www.p2peye.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=Lsc66&username=puqiuxiaomao&password=a1234567";    String listURL = "http://www.p2peye.com/blacklist.php?p=2";    String logURL = "http://www.p2peye.com/member.php";    //********************************需要登录的*************************************************    try {        Connection.Response res =             Jsoup.connect(logURL)              .data("mod","logging"                  ,"action","login"                  ,"loginsubmit","yes"                  ,"loginhash","Lsc66"                  ,"username","puqiuxiaomao"                  ,"password","a1234567")              .method(Method.POST)              .execute();                        //这儿的SESSIONID需要根据要登录的目标网站设置的session Cookie名字而定        Connection con=Jsoup.connect(listURL);        //设置访问形式(电脑访问,手机访问):直接百度都参数设置        
con.header("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");        //把登录信息的cookies保存如map对象里面        Map <String,String> map=res.cookies();        Iterator<Entry<String,String>> it =map.entrySet().iterator();        while(it.hasNext()){          Entry<String,String> en= it.next();           //把登录的信息放入请求里面          con =con.cookie(en.getKey(), en.getValue());                  }        //再次获取Document对象。        Document objectDoc = con.get();                Elements elements = objectDoc.getAllElements();//获取这个连接返回页面的源码内容(不是源码跟源码差不多)        for (Element element : elements) {          //element是迭代出来的标签:如:<div><span></span></div>          Elements elements2= element.getAllElements();//           for (Element element2 : elements2) {             element2.text();             element2.attr("href");//获取标签属性。element2代表a标签:href代表属性             element2.text();//获取标签文本          }        }                //********************************不需要登录的*************************************************                String URL = "http://www.p2peye.com/blacklist.php?p=2";        Document conTemp = Jsoup.connect(URL).get();        Elements elementsTemps = conTemp.getAllElements();         for (Element elementsTemp : elementsTemps) {           elementsTemp.text();           elementsTemp.attr("href");//获取标签属性。element2代表a标签:href代表属性           elementsTemp.text();//获取标签文本        }                        //********************************ajax方法获取内容。。。*************************************************。         HttpURLConnection connection = null;          BufferedReader reader = null;          try {            StringBuffer sb = new StringBuffer();            URL getUrl = new URL(URL);            connection = (HttpURLConnection)getUrl.openConnection();            reader = new BufferedReader(new InputStreamReader(                connection.getInputStream(),"utf-8"));            String lines;            while ((lines = reader.readLine()) != null) {              sb.append(lines);    
        };            List<Map<String, Object>> list = parseJSON2List(sb.toString());//json转换成list          } catch (Exception e) {                      } finally{            if(reader!=null)              try {                reader.close();              } catch (IOException e) {              }            // 断开连接            connection.disconnect();          }            } catch (IOException e) {      // TODO Auto-generated catch block      e.printStackTrace();    }      }    public static Map<String, Object> parseJSON2Map(String jsonStr){     Map<String, Object> map = new HashMap<String, Object>();     //最外层解析     JSONObject json = JSONObject.fromObject(jsonStr);     for(Object k : json.keySet()){       Object v = json.get(k);        //如果内层还是数组的话,继续解析       if(v instanceof JSONArray){         List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();         Iterator<JSONObject> it = ((JSONArray)v).iterator();         while(it.hasNext()){           JSONObject json2 = it.next();           list.add(parseJSON2Map(json2.toString()));         }         map.put(k.toString(), list);       } else {         map.put(k.toString(), v);       }     }     return map;   }     public static List<Map<String, Object>> parseJSON2List(String jsonStr){     JSONArray jsonArr = JSONArray.fromObject(jsonStr);     List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();     Iterator<JSONObject> it = jsonArr.iterator();     while(it.hasNext()){       JSONObject json2 = it.next();       list.add(parseJSON2Map(json2.toString()));     }     return list;   }     }

二,这个是获取验证码的类,可以研究下。(但你需要先分析出网站验证码的请求地址)

package com.minxinloan.black.web.utils;import java.io.BufferedReader;import java.io.DataInputStream;import java.io.DataOutputStream;import java.io.File;import java.io.FileOutputStream;import java.io.FileWriter;import java.io.InputStream;import java.io.InputStreamReader;import java.io.PrintWriter;import java.net.HttpURLConnection;import java.net.URL;import java.net.URLConnection;import java.nio.charset.Charset;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.StringTokenizer;public class Utils {//解析验证码的  public static Content getRandom(String method, String sUrl,// 要解析的url      Map<String, String> paramMap, // 存放用户名和密码的map      Map<String, String> requestHeaderMap,// 存放COOKIE的map      boolean isOnlyReturnHeader, String path) {    Content content = null;    HttpURLConnection httpUrlConnection = null;    InputStream in = null;    try {      URL url = new URL(sUrl);      boolean isPost = "POST".equals(method);      if (method == null          || (!"GET".equalsIgnoreCase(method) && !"POST"              .equalsIgnoreCase(method))) {        method = "POST";      }      URL resolvedURL = url;      URLConnection urlConnection = resolvedURL.openConnection();      httpUrlConnection = (HttpURLConnection) urlConnection;      httpUrlConnection.setRequestMethod(method);      httpUrlConnection.setRequestProperty("Accept-Language",          "zh-cn,zh;q=0.5");      // Do not follow redirects, We will handle redirects ourself      httpUrlConnection.setInstanceFollowRedirects(false);      httpUrlConnection.setDoOutput(true);      httpUrlConnection.setDoInput(true);      httpUrlConnection.setConnectTimeout(5000);      httpUrlConnection.setReadTimeout(5000);      httpUrlConnection.setUseCaches(false);      httpUrlConnection.setDefaultUseCaches(false);      httpUrlConnection.connect();      int responseCode = httpUrlConnection.getResponseCode();      if (responseCode == HttpURLConnection.HTTP_OK          || responseCode == 
HttpURLConnection.HTTP_CREATED) {        byte[] bytes = new byte[0];        if (!isOnlyReturnHeader) {          DataInputStream ins = new DataInputStream(              httpUrlConnection.getInputStream());          // 验证码的位置          DataOutputStream out = new DataOutputStream(              new FileOutputStream(path + "/code.bmp"));          byte[] buffer = new byte[4096];          int count = 0;          while ((count = ins.read(buffer)) > 0) {            out.write(buffer, 0, count);          }          out.close();          ins.close();        }        String encoding = null;        if (encoding == null) {          encoding = getEncodingFromContentType(httpUrlConnection              .getHeaderField(""));        }        content = new Content(sUrl, new String(bytes, encoding),            httpUrlConnection.getHeaderFields());      }    } catch (Exception e) {      return null;    } finally {      if (httpUrlConnection != null) {        httpUrlConnection.disconnect();      }    }    return content;  }  public static String getEncodingFromContentType(String contentType) {    String encoding = null;    if (contentType == null) {      return null;    }    StringTokenizer tok = new StringTokenizer(contentType, ";");    if (tok.hasMoreTokens()) {      tok.nextToken();      while (tok.hasMoreTokens()) {        String assignment = tok.nextToken().trim();        int eqIdx = assignment.indexOf('=');        if (eqIdx != -1) {          String varName = assignment.substring(0, eqIdx).trim();          if ("charset".equalsIgnoreCase(varName)) {            String varValue = assignment.substring(eqIdx + 1)                .trim();            if (varValue.startsWith("/"")                && varValue.endsWith("/"")) {              // substring works on indices              varValue = varValue.substring(1,                  varValue.length() - 1);            }            if (Charset.isSupported(varValue)) {              encoding = varValue;            }          }        }      }    }    
if (encoding == null) {      return "UTF-8";    }    return encoding;  }  // 这个是输出  public static boolean inFile(String content, String path) {    PrintWriter out = null;    File file = new File(path);    try {      if (!file.exists()) {        file.createNewFile();      }      out = new PrintWriter(new FileWriter(file));      out.write(content);      out.flush();      return true;    } catch (Exception e) {      e.printStackTrace();    } finally {      out.close();    }    return false;  }  public static String getHtmlReadLine(String httpurl) {    String CurrentLine = "";    String TotalString = "";    InputStream urlStream;    String content = "";    try {      URL url = new URL(httpurl);      HttpURLConnection connection = (HttpURLConnection) url          .openConnection();      connection.connect();      System.out.println(connection.getResponseCode());      urlStream = connection.getInputStream();      BufferedReader reader = new BufferedReader(      new InputStreamReader(urlStream, "utf-8"));      while ((CurrentLine = reader.readLine()) != null) {        TotalString += CurrentLine + "/n";      }      content = TotalString;    } catch (Exception e) {    }    return content;  }}class Content {  private String url;  private String body;  private Map<String, List<String>> m_mHeaders = new HashMap<String, List<String>>();  public Content(String url, String body, Map<String, List<String>> headers) {    this.url = url;    this.body = body;    this.m_mHeaders = headers;  }  public String getUrl() {    return url;  }  public String getBody() {    return body;  }  public Map<String, List<String>> getHeaders() {    return m_mHeaders;  }}


注:相关教程知识阅读请移步到JAVA教程频道。
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表