首页 > 学院 > 开发设计 > 正文

Java 根据关键字抓取 Google 新闻网络数据

2019-11-17 04:02:28
字体:
来源:转载
供稿:网友
需求:统计给定关键字在网络上出现的新闻。下面是实现代码,其中包含一个用于测试的 main 方法。

package com.net;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Requests a news search page for a set of keywords and extracts the
 * {@code <h2 class="title">} headline elements from the returned HTML.
 *
 * <p>Instances hold no state, so {@link #getNews(String, String[])} can be
 * called repeatedly without results from earlier calls leaking into later ones.
 *
 * @{#} NetTools.java Create on Nov 18, 2009 4:55:57 PM
 *
 * Copyright (c) 2009 by ThinkIT
 * @author Jack He ,jackhexl@Gmail.com
 * @version 1.0
 */
public class NetTools {

    /** Search endpoint used when the caller passes a null or empty URL. */
    private static final String DEFAULT_SEARCH_URL =
            "http://news.google.cn/news/search?cf=all&scoring=n&pz=1&cf=all&ned=ccn&hl=zh-CN&q=";

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)";

    /** Charset used both to URL-encode the query and to decode the response body. */
    private static final String ENCODING = "UTF-8";

    /**
     * Matches one complete headline element. The quantifier is reluctant so each
     * match stops at the nearest closing tag, and DOTALL lets a headline span lines.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<h2 class=\"title\">.*?</h2>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Test entry point: searches the default endpoint for one sample keyword and
     * prints every headline element found.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        NetTools nt = new NetTools();
        try {
            List<String> list = nt.getNews("", new String[]{"环保局"});
            for (String news : list) {
                System.out.println(news);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the search result page for the given keywords and returns the
     * headline elements it contains.
     *
     * @param url      search URL prefix ending where the query value goes; the
     *                 default endpoint is used when this is {@code null} or empty
     * @param keywords search terms; joined with single spaces and URL-encoded
     * @return the matched {@code <h2 class="title">...</h2>} fragments in document
     *         order, or an empty list when none are present
     * @throws IOException if the URL is malformed or the request/read fails
     */
    public List<String> getNews(String url, String[] keywords) throws IOException {
        String requestAddress = buildRequestUrl(url, keywords);
        System.out.println("请求地址为:" + requestAddress);

        HttpURLConnection connection =
                (HttpURLConnection) new URL(requestAddress).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        try {
            connection.connect();
            return extractTitles(readBody(connection));
        } finally {
            // Release the underlying socket even when the request or read fails.
            connection.disconnect();
        }
    }

    /**
     * Builds the full request address: the URL prefix followed by the encoded query.
     * Package-private so it can be verified without network access.
     *
     * @param url      URL prefix, or {@code null}/empty to use the default endpoint
     * @param keywords search terms; {@code null} array and {@code null}/empty
     *                 elements are skipped
     * @return the prefix with the URL-encoded, space-joined keywords appended
     * @throws IOException if the encoding is not supported by the runtime
     */
    static String buildRequestUrl(String url, String[] keywords) throws IOException {
        // Null must be checked before isEmpty() to avoid a NullPointerException.
        String base = (url == null || url.isEmpty()) ? DEFAULT_SEARCH_URL : url;

        StringBuilder query = new StringBuilder();
        if (keywords != null) {
            for (String keyword : keywords) {
                if (keyword == null || keyword.isEmpty()) {
                    continue;
                }
                if (query.length() > 0) {
                    query.append(' ');
                }
                query.append(keyword);
            }
        }
        return base + java.net.URLEncoder.encode(query.toString(), ENCODING);
    }

    /**
     * Reads the whole response body and decodes it as UTF-8.
     *
     * @param connection a connected HTTP connection
     * @return the decoded response body
     * @throws IOException if the stream cannot be opened or read
     */
    private static String readBody(HttpURLConnection connection) throws IOException {
        try (InputStream in = connection.getInputStream()) {
            ByteArrayOutputStream body = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int read;
            while ((read = in.read(chunk)) != -1) {
                body.write(chunk, 0, read);
            }
            // Decode once at the end so a multi-byte character that straddles two
            // chunks is not corrupted.
            return body.toString(ENCODING);
        }
    }

    /**
     * Collects every headline element found in the given HTML.
     * Package-private so it can be verified without network access.
     *
     * @param html page source; {@code null} is treated as empty
     * @return the matched fragments in document order, never {@code null}
     */
    static List<String> extractTitles(String html) {
        List<String> titles = new ArrayList<String>();
        if (html == null) {
            return titles;
        }
        Matcher matcher = TITLE_PATTERN.matcher(html);
        while (matcher.find()) {
            titles.add(matcher.group());
        }
        return titles;
    }
}
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表