因为工作需要，自己写了一个采集程序。如果冒犯了你的网站，我在这里说一声对不起！
哎～！我只是一个普通的程序员。
namespace CJ
{
public partial class Form1 : Form
{
// Index into `al` of the proxy currently in use; DownFile and ToServer advance it
// when a request fails. (Was spelled `PRoxy`, which none of the call sites matched.)
public int proxy = 0;
// Running key counters for the generated insert statements:
// keyi -> mzinedl rows, keyj -> mzinexl rows, keym -> mzineinfo rows, keyn -> mzinewz rows.
public int keyi = 0;
public int keyj = 0;
public int keym = 0;
public int keyn = 0;
// Number of pages processed so far; shown in textBox1 when the crawl ends.
public int sum = 0;
// Scratch state shared by the crawl methods (HtmlSource, one, two, three, four).
public string newurl = "";
public string cururl = "";
public string dirname = "";
public string curdir = "";
public string responseFromServer = "";
public string filename = "";
public string sql = "";
// Last section name seen by three(); reused for entries that carry no name of their own.
public string mulu = "";
// SQL statements collected by SaveSqls and sent as one batch by Executesql.
StringBuilder sbs = new StringBuilder();
// Article records queued by four() and written to disk by SaveXmls.
List<Class1> cls = new List<Class1>();
// Proxy addresses, one per line of the file read by ReadIPproxy.
public ArrayList al = new ArrayList();
// Statement prefixes; the callers append the values and the closing parenthesis.
public string insertdl = "insert into mzinedl values(";
public string insertxl = "insert into mzinexl values(";
public string insertinfo = "insert into mzineinfo values(";
public string insertwz = "insert into mzinewz values(";
/// <summary>
/// Creates the form and calls InitializeComponent().
/// </summary>
public Form1()
{
InitializeComponent();
}
/// <summary>
/// Saves a page to disk. Does nothing when a file already exists at the given path.
/// </summary>
/// <param name="FILE_NAME">Path of the file to create.</param>
/// <param name="data">Text to write into the new file.</param>
public void TextToFile(string FILE_NAME, string data)
{
    bool alreadySaved = File.Exists(FILE_NAME);
    if (!alreadySaved)
    {
        StreamWriter writer = File.CreateText(FILE_NAME);
        try
        {
            writer.Write(data);
        }
        finally
        {
            // Disposing flushes the text and closes the file.
            writer.Dispose();
        }
    }
}
/// <summary>
/// Downloads a file through the current proxy into a local directory.
/// A file that already exists locally is not downloaded again.
/// </summary>
/// <param name="PageUrl">URL of the file; the text after its last "/" is used as the local file name.</param>
/// <param name="filename">Directory to save into; created when missing.</param>
public void DownFile(string PageUrl, string filename)
{
    if (!Directory.Exists(filename))
    {
        Directory.CreateDirectory(filename);
    }
    string remoteName = PageUrl.Substring(PageUrl.LastIndexOf("/") + 1);
    string localPath = filename + "//" + remoteName;

    // Retry in a loop instead of recursing, so a long run of dead proxies
    // cannot overflow the stack.
    while (!File.Exists(localPath))
    {
        try
        {
            using (WebClient wc = new WebClient())
            {
                wc.Proxy = new WebProxy(al[proxy].ToString(), true);
                wc.DownloadFile(PageUrl, localPath);
            }
            return;
        }
        catch (WebException ex)
        {
            bool proxyFailed = ex.Status == WebExceptionStatus.ConnectFailure
                || ex.Status == WebExceptionStatus.Timeout;
            if (!proxyFailed)
            {
                // ProtocolError (e.g. file not found) or any other status: give up on this file.
                return;
            }
            // Could not connect or timed out: move on to the next proxy and try again.
            proxy++;
            if (proxy >= al.Count)
            {
                // Ran past the end of the list: read the proxy file again.
                al = ReadIPproxy("e://test.txt");
            }
        }
    }
}
/// <summary>
/// Reads a text file line by line and appends every line to the proxy list <c>al</c>.
/// </summary>
/// <param name="FILE_NAME">Path of the file to read.</param>
/// <returns>The <c>al</c> list itself, with the new lines added after its existing entries.</returns>
public ArrayList ReadIPproxy(string FILE_NAME)
{
    // Existing entries are kept; the file's lines go on the end.
    string[] lines = File.ReadAllLines(FILE_NAME);
    al.AddRange(lines);
    return al;
}
/// <summary>
/// Sends every statement collected in <c>sbs</c> to the database as one text command.
/// </summary>
public void Executesql()
{
    string batch = sbs.ToString();
    SqlHelper.ExecuteNonQuery(SqlHelper.sqlstr, CommandType.Text, batch, null);
}
/// <summary>
/// Reads a whole text file into a string.
/// </summary>
/// <param name="FILE_NAME">Path of the file to read.</param>
/// <returns>The complete contents of the file.</returns>
public string FileToText(string FILE_NAME)
{
    return File.ReadAllText(FILE_NAME);
}
/// <summary>
/// Queues one SQL statement in <c>sbs</c> for the batch that Executesql sends.
/// </summary>
/// <param name="sql">A complete SQL statement.</param>
public void SaveSqls(string sql)
{
    // Separate statements with a real newline. The previous literal "/n" was
    // appended verbatim and ended up inside the SQL batch text.
    sbs.Append(sql).Append("\n");
}
/// <summary>
/// Fetches a page through the current proxy, switching proxies and retrying until a usable body arrives.
/// </summary>
/// <param name="PageUrl">Address of the page to fetch.</param>
/// <returns>
/// The page text, or an empty string when the server answers with a protocol error.
/// </returns>
public string ToServer(string PageUrl)
{
    // One loop handles every retry. The earlier recursive retries threw their
    // result away, so a failed attempt returned empty or stale text.
    while (true)
    {
        string body = "";
        try
        {
            WebRequest request = WebRequest.Create(PageUrl);
            request.Proxy = new WebProxy(al[proxy].ToString(), true);
            request.Timeout = 1000 * 60;
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream dataStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(dataStream, System.Text.Encoding.Default))
            {
                body = reader.ReadToEnd();
            }
        }
        catch (WebException ex)
        {
            if (ex.Status == WebExceptionStatus.ProtocolError)
            {
                // The server answered with an error status: report "no content".
                return "";
            }
            body = "";
        }
        catch (IOException)
        {
            // The connection broke while reading the body: treat as a failed attempt.
            body = "";
        }

        // A body containing "refresh" is rejected the same way as an empty one.
        if (body != "" && !body.Contains("refresh"))
        {
            return body;
        }

        proxy++;
        if (proxy >= al.Count)
        {
            // Ran past the end of the list: read the proxy file again.
            al = ReadIPproxy("e://test.txt");
        }
    }
}
/// <summary>
/// Serializes every queued <c>Class1</c> in <c>cls</c> to the file named by its
/// <c>address</c>, skipping files that already exist.
/// </summary>
public void SaveXmls()
{
    // Created on first use and shared by all items.
    XmlSerializer xs = null;
    foreach (Class1 page in cls)
    {
        string pathxml = page.address;
        if (File.Exists(pathxml))
        {
            continue;
        }
        if (xs == null)
        {
            xs = new XmlSerializer(typeof(Class1));
        }
        // `using` closes the file even when Serialize throws.
        using (Stream stream = new FileStream(pathxml, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
        {
            xs.Serialize(stream, page);
        }
    }
}
/// <summary>
/// Strips markup tags from a piece of HTML.
/// </summary>
/// <param name="Html">Text to clean.</param>
/// <returns>The input with every match of <c>&lt;.*?&gt;</c> deleted.</returns>
public static string Remove(string Html)
{
    // '.' does not match a newline here, so a tag split across lines is left in place.
    Regex tag = new Regex("<.*?>", RegexOptions.IgnoreCase);
    return tag.Replace(Html, string.Empty);
}
/// <summary>
/// Removes every <c>&lt;script&gt;…&lt;/script&gt;</c> block, including its contents.
/// </summary>
/// <param name="content">HTML to clean.</param>
/// <returns>The input without script blocks; text between separate blocks is kept.</returns>
public static string FilterScript(string content)
{
    // The previous pattern had its backslashes turned into slashes ("/s", "<//1>")
    // and could never match. The lazy [\s\S]*? stops at the first closing tag, so
    // text between two script blocks is preserved.
    string regexstr = @"<script\b[^>]*>[\s\S]*?</script\s*>";
    return Regex.Replace(content, regexstr, string.Empty, RegexOptions.IgnoreCase);
}
/// <summary>
/// Filters out dangerous markup: script blocks, script-scheme hrefs, inline
/// <c>on…=</c> event attributes, iframes and framesets.
/// </summary>
/// <param name="html">HTML to clean.</param>
/// <returns>The cleaned HTML.</returns>
public string wipeScript(string html)
{
    // The earlier patterns had "\s", "\S" and "\0" mangled into "/s", "/S" and "/0"
    // and did not match their targets. Lazy and bounded quantifiers are used so a
    // match ends at the nearest closing tag rather than the last one in the text.
    const RegexOptions opts = RegexOptions.IgnoreCase;
    // <script>...</script> blocks
    html = Regex.Replace(html, @"<script\b[^>]*>[\s\S]*?</script\s*>", "", opts);
    // href=javascript: (and other *script: schemes) on links
    html = Regex.Replace(html, @"href\s*=\s*[""']?\s*\w*script\s*:", "", opts);
    // on...= event attributes are renamed so they no longer fire
    html = Regex.Replace(html, @"\bon\w+\s*=", " _disibledevent=", opts);
    // <iframe>...</iframe>
    html = Regex.Replace(html, @"<iframe\b[\s\S]*?</iframe\s*>", "", opts);
    // <frameset>...</frameset>
    html = Regex.Replace(html, @"<frameset\b[\s\S]*?</frameset\s*>", "", opts);
    return html;
}
/// <summary>
/// Entry point of the crawl. Loads the index page (from the local copy when present,
/// otherwise from the server), queues one mzinedl row per link found, walks each link
/// with <c>one</c>, then writes the queued XML files and runs the queued SQL.
/// </summary>
/// <param name="urlpri">Address of the index page; link targets are appended to it.</param>
public void HtmlSource(string urlpri)
{
    // Local copy of the index page.
    filename = "E://观2//magazine.html";
    if (!Directory.Exists("E://观2"))
    {
        Directory.CreateDirectory("E://观2");
    }
    if (File.Exists(filename))
    {
        responseFromServer = FileToText(filename);
    }
    else
    {
        responseFromServer = ToServer(urlpri);
    }
    sum++;
    if (responseFromServer == "")
    {
        return;
    }

    TextToFile(filename, responseFromServer);
    MatchCollection mc = Regex.Matches(responseFromServer, @"href=""/magazine/(.*)""><b>(.*)</b>", RegexOptions.IgnoreCase);
    foreach (Match m in mc)
    {
        newurl = m.Groups[1].Value;
        dirname = m.Groups[2].Value;
        int key = ++keyi;
        // The title comes from the scraped page, so double any single quotes
        // before placing it inside the SQL string literal.
        sql = insertdl + key + ",'" + dirname.Replace("'", "''") + "')";
        SaveSqls(sql);
        cururl = urlpri + newurl;
        curdir = "E://观2//" + dirname;
        one(cururl, curdir, key);
    }
    SaveXmls();
    Executesql();
    this.textBox1.Text = sum.ToString();
    MessageBox.Show("采集成功!");
}
/// <summary>
/// Second crawl level. Loads one page linked from the index, saves it under
/// <paramref name="_dirname"/>, and calls <c>two</c> for every "…list.html" link on it.
/// </summary>
/// <param name="urlpri">Address of the page.</param>
/// <param name="_dirname">Directory for this page's local copy and its children.</param>
/// <param name="_key">Key of the parent mzinedl row, passed on to <c>two</c>.</param>
public void one(string urlpri,string _dirname,int _key)
{
    // Local copy is named after the last URL segment.
    filename = _dirname + "//" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);
    if (!Directory.Exists(_dirname))
    {
        Directory.CreateDirectory(_dirname);
    }
    if (File.Exists(filename))
    {
        responseFromServer = FileToText(filename);
    }
    else
    {
        responseFromServer = ToServer(urlpri);
    }
    sum++;
    if (responseFromServer == "")
    {
        return;
    }

    TextToFile(filename, responseFromServer);
    // Backslashes restored: the pattern previously read "/././" and "[/s/S]",
    // which is "\.\./" and "[\s\S]" with every backslash turned into a slash.
    // Group 1 is the relative link after "../"; group 2 is the title inside 《》.
    MatchCollection mc = Regex.Matches(responseFromServer, @"href=""\.\./(.*list.html)""[\s\S]*?《(.*?)》", RegexOptions.IgnoreCase);
    foreach (Match m in mc)
    {
        newurl = m.Groups[1].Value;
        dirname = m.Groups[2].Value;
        cururl = "http://www.zydg.net/magazine/" + newurl;
        curdir = _dirname + "//" + dirname;
        two(cururl, curdir, _key);
    }
}
/// <summary>
/// Third crawl level. Loads a "list.html" page, queues one mzinexl row with the
/// publication details parsed from it, and calls <c>three</c> for every issue link.
/// </summary>
/// <param name="urlpri">Address of the list page.</param>
/// <param name="_dirname">Directory for this page's local copy and its children.</param>
/// <param name="_key">Key of the parent mzinedl row, stored in the mzinexl row.</param>
public void two(string urlpri,string _dirname,int _key)
{
    // Local copy is named after the URL's parent folder: ".../<name>/list.html" -> "<name>.html".
    filename = urlpri.Substring(0, urlpri.LastIndexOf("/"));
    filename = filename.Substring(filename.LastIndexOf("/") + 1) + ".html";
    filename = _dirname + "//" + filename;
    if (!Directory.Exists(_dirname))
    {
        Directory.CreateDirectory(_dirname);
    }
    if (File.Exists(filename))
    {
        responseFromServer = FileToText(filename);
    }
    else
    {
        responseFromServer = ToServer(urlpri);
    }
    sum++;
    if (responseFromServer == "")
    {
        return;
    }

    TextToFile(filename, responseFromServer);
    // Backslashes restored in all three patterns below ("/s" -> "\s", "[/s/S]" -> "[\s\S]").
    // Groups 1-11 are the labelled detail fields, in the order they appear on the page.
    Match info = Regex.Match(responseFromServer, @"刊\s+期:(.*?)<br>[\s\S]*?编\s+辑:(.*?)<br>[\s\S]*?出\s+版: (.*?)<br>[\s\S]*?联系电话:(.*?)<br>[\s\S]*?E-mail: (.*?)<br>[\s\S]*?社\s+址:(.*?)<br>[\s\S]*?邮\s+编: (.*?)<br>[\s\S]*?邮发代号:(.*?)<br>[\s\S]*?国外发行代号: (.*?)<br>[\s\S]*?国际标准刊号:(.*?)<br>[\s\S]*?国内统一刊号: (.*?)</td>", RegexOptions.IgnoreCase);
    Match intro = Regex.Match(responseFromServer, @"刊\s+物\s+简\s+介\s+:::...([\s\S]*?)...:::\s+收录期号列表", RegexOptions.Multiline);
    int key = ++keyj;

    // Every text value comes from the scraped page, so single quotes are doubled
    // before the value goes inside a SQL string literal.
    StringBuilder row = new StringBuilder(insertxl);
    row.Append(key).Append(",").Append(_key);
    // `dirname` still holds the title set by the caller at this point.
    row.Append(",'").Append(dirname.Replace("'", "''")).Append("'");
    for (int g = 1; g <= 11; g++)
    {
        row.Append(",'").Append(info.Groups[g].Value.Replace("'", "''")).Append("'");
    }
    row.Append(",'").Append(Remove(intro.Groups[1].Value).Replace("'", "''")).Append("')");
    sql = row.ToString();
    SaveSqls(sql);

    MatchCollection mc2 = Regex.Matches(responseFromServer, @"href='(.*?)'\s+target.*>(.*?)</a>", RegexOptions.IgnoreCase);
    foreach (Match m2 in mc2)
    {
        newurl = m2.Groups[1].Value;
        // Link text such as "2008年第3期" becomes "2008-3".
        dirname = m2.Groups[2].Value.Replace("年", "-").Replace("第", "").Replace("期", "");
        cururl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1) + newurl;
        curdir = _dirname + "//" + dirname;
        three(cururl, curdir, key, dirname);
    }
}
/// <summary>
/// Fourth crawl level. Loads one issue page, downloads its "face_…" cover image,
/// queues one mzineinfo row, then queues one mzinewz row and calls <c>four</c>
/// for every article link.
/// </summary>
/// <param name="urlpri">Address of the issue page.</param>
/// <param name="_dirname">Directory for this issue's files.</param>
/// <param name="_key">Key of the parent mzinexl row, stored in the mzineinfo row.</param>
/// <param name="qishu">Issue label stored in the mzineinfo row.</param>
public void three(string urlpri,string _dirname,int _key,string qishu)
{
    // Local copy is named after the last URL segment.
    filename = _dirname + "//" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);
    if (!Directory.Exists(_dirname))
    {
        Directory.CreateDirectory(_dirname);
    }
    if (File.Exists(filename))
    {
        responseFromServer = FileToText(filename);
    }
    else
    {
        responseFromServer = ToServer(urlpri);
    }
    sum++;
    if (responseFromServer == "")
    {
        return;
    }

    TextToFile(filename, responseFromServer);
    string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);

    Match m = Regex.Match(responseFromServer, @"src='face_(.*?)'", RegexOptions.IgnoreCase);
    string coverSuffix = m.Groups[1].Value;
    if (coverSuffix.Trim() != "")
    {
        string photoName = baseUrl + "face_" + coverSuffix;
        DownFile(photoName, _dirname);
    }

    int key = ++keym;
    // Scraped or derived text is quote-doubled before it goes inside a SQL string literal.
    string coverPath = _dirname + "//" + "face_" + coverSuffix;
    sql = insertinfo + key + "," + _key + ",'" + qishu.Replace("'", "''") + "','" + coverPath.Replace("'", "''") + "')";
    SaveSqls(sql);

    // Backslashes restored ("/d" -> "\d", "/s" -> "\s", "[/s/S]" -> "[\s\S]").
    // First alternative: an article link (group 1) and its caption (group 2).
    // Second alternative: a bracketed section heading (group 3).
    // NOTE(review): the second alternative was garbled ("& lt;font…>[(.+?)]", which
    // defines no group 3 at all); it is reconstructed here as a <font> tag followed
    // by "[name]". Confirm against a saved issue page.
    MatchCollection mc2 = Regex.Matches(responseFromServer, @"href='(\d+.html?)'[\s\S]*?<font\s+color=black>(.*?)</a>|<font[^>]*?>\[(.+?)\]", RegexOptions.IgnoreCase);
    foreach (Match m2 in mc2)
    {
        newurl = m2.Groups[1].Value;
        string muName = m2.Groups[3].Value;
        if (muName == "")
        {
            // No heading in this match: the article belongs to the last heading seen.
            muName = mulu;
        }
        string lstr = m2.Groups[2].Value;
        if (lstr != "")
        {
            // Caption is split into the text before the first "." and the text after the last ".".
            string s1 = lstr;
            string s2 = "";
            if (lstr.Contains("."))
            {
                s1 = lstr.Substring(0, lstr.IndexOf("."));
                s2 = lstr.Substring(lstr.LastIndexOf(".") + 1);
            }
            int k2 = ++keyn;
            sql = insertwz + k2 + "," + key + ",'" + muName.Replace("'", "''") + "','" + s1.Replace("'", "''") + "','" + s2.Replace("'", "''") + "')";
            SaveSqls(sql);
            cururl = baseUrl + newurl;
            curdir = _dirname;
            four(cururl, curdir, k2);
        }
        mulu = muName;
    }
}
/// <summary>
/// Last crawl level. Loads one article page, queues its cleaned body text as a
/// <c>Class1</c> for <c>SaveXmls</c>, and downloads the images and PDF files it links to.
/// </summary>
/// <param name="urlpri">Address of the article page.</param>
/// <param name="_dirname">Directory for the page copy, the xml file and the downloads.</param>
/// <param name="_key">Key of the parent mzinewz row, stored in the queued <c>Class1</c>.</param>
public void four(string urlpri,string _dirname,int _key)
{
    string pageName = urlpri.Substring(urlpri.LastIndexOf("/") + 1);
    filename = _dirname + "//" + pageName;
    if (!Directory.Exists(_dirname))
    {
        Directory.CreateDirectory(_dirname);
    }
    if (File.Exists(filename))
    {
        responseFromServer = FileToText(filename);
    }
    else
    {
        responseFromServer = ToServer(urlpri);
    }
    sum++;
    if (responseFromServer == "")
    {
        return;
    }

    TextToFile(filename, responseFromServer);

    // Body text sits between the "正文开始" and "正文结束" comment markers.
    // Backslashes restored: "[/s/S]" -> "[\s\S]".
    Match m = Regex.Match(responseFromServer, @"正文开始-->(?<text>[\s\S]*?)<!--正文结束", RegexOptions.IgnoreCase);
    string content = m.Groups["text"].Value;
    string c = Remove(FilterScript(content));

    // The xml file gets the page's name with its extension replaced by ".xml".
    // A name without a dot is used whole instead of throwing from Substring.
    int dot = pageName.LastIndexOf(".");
    string stem = dot >= 0 ? pageName.Substring(0, dot) : pageName;
    string pathxml = _dirname + "//" + stem + ".xml";
    cls.Add(new Class1(_key, c, pathxml));

    // Backslashes restored ("/s" -> "\s"). The PDF group is now named "pdfs" to match
    // the lookup below, which previously always came back empty. The image capture is
    // lazy so two images on one line are not merged into a single match.
    string zhuurl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);
    MatchCollection mc = Regex.Matches(responseFromServer, @"(<img\s+src=""(?<imgs>.*?)""\s+hspace|href=""(?<pdfs>[^>]*PDF)"")", RegexOptions.IgnoreCase);
    foreach (Match m2 in mc)
    {
        string imgurl = m2.Groups["imgs"].Value.Trim();
        if (imgurl != "")
        {
            DownFile(zhuurl + imgurl, _dirname);
        }
        string pdfurl = m2.Groups["pdfs"].Value.Trim();
        if (pdfurl != "")
        {
            DownFile(zhuurl + pdfurl, _dirname);
        }
    }
}
/// <summary>
/// Click handler for btnOK: loads the proxy list, then starts the crawl at the index page.
/// </summary>
private void btnOK_Click(object sender, EventArgs e)
{
    const string proxyListFile = "e://test.txt";
    const string startPage = "http://www.zydg.net/magazine/";

    // The proxy list must be filled before the first request is made.
    al = ReadIPproxy(proxyListFile);
    HtmlSource(startPage);
}
/// <summary>
/// Click handler for button1: closes the application.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    // `Application` (System.Windows.Forms) is case-sensitive; the lowercase
    // `application` did not resolve.
    Application.Exit();
}
}
}
新闻热点
疑难解答