最近学了下网络爬虫,打算从一个网站上提取点东西,自己练练手。刚开始还能从这个网站上取到正确的html,后来百般尝试都取不到正确的html,希望能得到大家的帮助~
我刚开始的代码是:
1     url="http://www.karger.com/Collections/Hospital";2     data = urllib.request.urlopen(url).read();3     data=data.decode('gb2312');4     data=BeautifulSoup(data);5     print(data);后来改成下面这样:
 1 url="http://www.karger.com/Collections/Hospital"; 2  3 headers = [('Host','www.karger.com'), 4             ('Connection', 'keep-alive'),  5             ('Cache-Control', 'max-age=0'), 6              ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'), 7               ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'),  8               ('Accept-Encoding','gzip, deflate'), 9                ('Accept-Language', 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'), 10                ('If-None-Match', '90101f995236651aa74454922de2ad74'), 11                ('Referer','"http://www.karger.com/Collections/Hospital"'),12                ('If-Modified-Since', 'Thu, 01 Jan 1970 00:00:00 GMT')] 13 opener = urllib.request.build_opener() 14 opener.addheaders = headers 15 data = opener.open(url).read();16 print(data)  得到的执行结果都是下面这样:
b'<html>\r\n<head>\r\n<META NAME="robots" CONTENT="noindex,nofollow">\r\n<script>\r\n(function() {  function getSessionCookies() {   cookieArray = new Array();   var cName = /^\\s?incap_ses_/;   var c = document.cookie.split(";");   for (var i = 0; i < c.length; i++) {    key = c[i].substr(0, c[i].indexOf("="));    value = c[i].substr(c[i].indexOf("=") + 1, c[i].length);    if (cName.test(key)) {     cookieArray[cookieArray.length] = value    }   }   return cookieArray  }  function setIncapCookie(vArray) {   try {    cookies = getSessionCookies();    digests = new Array(cookies.length);    for (var i = 0; i < cookies.length; i++) {     digests[i] = simpleDigest((vArray) + cookies[i])    }    res = vArray + ",digest=" + (digests.join())   } catch (e) {    res = vArray + ",digest=" + (encodeURIComponent(e.toString()))   }   createCookie("___utmvc", res, 20)  }  function simpleDigest(mystr) {   var res = 0;   for (var i = 0; i < mystr.length; i++) {    res += mystr.charCodeAt(i)   }   return res  }  function createCookie(name, value, seconds) {   if (seconds) {    var date = new Date();    date.setTime(date.getTime() + (seconds * 1000));    var expires = "; expires=" + date.toGMTString()   } else {    var expires = ""   }   document.cookie = name + "=" + value + expires + "; path=/"  }  function test(o) {   var res = "";   var vArray = new Array();   for (var j = 0; j < o.length; j++) {    var test = o[j][0]    switch (o[j][1]) {    case "exists_boolean":     try { \t if(typeof(eval(test)) != "undefined"){ \t\tvArray[vArray.length] = encodeURIComponent(test + "=true") \t } \t else{ \t\tvArray[vArray.length] = encodeURIComponent(test + "=false") \t }     } catch (e) {      vArray[vArray.length] = encodeURIComponent(test + "=false")     }     break;    case "exists":     try {      vArray[vArray.length] = encodeURIComponent(test + "=" + typeof(eval(test)))     } catch (e) {      vArray[vArray.length] = encodeURIComponent(test + "=" + e)     }     break;    case "value": 
    try {      vArray[vArray.length] = encodeURIComponent(test + "=" + eval(test).toString())     } catch (e) {      vArray[vArray.length] = encodeURIComponent(test + "="