首页 > 编程 > Python > 正文

python实现的一个火车票转让信息采集器

2019-11-25 18:20:10
字体:
来源:转载
供稿:网友

好吧,我承认:晚上看到一张合适的转让票,打电话过去却被告知已经被人买走了,这件事让我很蛋疼。直接上代码吧。

# coding: utf-8
"""
Look up second-hand train ticket transfer listings for the Spring Festival
travel season and e-mail any new sleeper-berth offers.

Targets Python 2.6+ (urllib2, urlparse, BeautifulSoup 3).

Author: piglei2007@gmail.com
Date: 2011.01.25
"""
import datetime
import io
import os
import re
import socket
import time
import traceback
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup

# Give up on any single network operation after 20 seconds.
socket.setdefaulttimeout(20)

# Runs of whitespace, removed from listing titles before they are mailed.
BLANK_RE = re.compile(r"\s+")

# Install a cookie-aware opener with browser-like headers for all urlopen calls.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
opener.addheaders = [
    ("User-agent", "Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.1) Gecko/20090704 Firefox/3.5"),
    ("Accept", "*/*"),
]
urllib2.install_opener(opener)

# Listing page URL templates, keyed by source name; filled with train and date.
SOURCE = {
    "58": "http://bj.58.com/huochepiao/?Num=%(train)s&StartTime=%(date)s00",
    "ganji": "http://bj.ganji.com/piao/cc_%(train)s/%(date)s/",
}

# One already-seen listing URL per line, UTF-8 encoded.
RECORD_FILE = "/tmp/ticket_records.txt"


def parse_record():
    """
    Load the set of listing URLs seen by previous runs.

    Returns the non-empty, stripped lines of RECORD_FILE as a set of unicode
    strings. If the file cannot be read, an empty record file is created and
    an empty set is returned.
    """
    try:
        with io.open(RECORD_FILE, "r", encoding="utf-8") as record_file:
            stripped = (line.strip() for line in record_file)
            return set(line for line in stripped if line)
    except IOError:
        # First run (or unreadable file): start over with an empty record file.
        io.open(RECORD_FILE, "w", encoding="utf-8").close()
        return set()


def flush_record(records):
    """
    Overwrite RECORD_FILE with `records`, one entry per line.

    `records` is an iterable of unicode (or ASCII-only byte) strings.
    """
    with io.open(RECORD_FILE, "w", encoding="utf-8") as record_file:
        record_file.write(u"\n".join(records))


def main(config):
    """
    Fetch every source for every configured train/date and mail new listings.

    `config` is a dict with the keys "trains", "dates" (both iterables of
    strings substituted into the SOURCE templates) and "people" (recipient
    addresses passed to simple_mail). Every listing URL found is recorded so
    it is not reported again; only unseen listings whose text contains the
    sleeper-berth marker are mailed. The record file is written last, so a
    failure while fetching or mailing leaves the previous records in place.
    """
    existed = parse_record()
    to_email = []
    for train in config["trains"]:
        for date in config["dates"]:
            for source, url_template in SOURCE.items():
                page_url = url_template % dict(train=train, date=date)
                content = urllib2.urlopen(page_url).read()
                soup = BeautifulSoup(content)
                for href, text in parse_content(source, soup, train):
                    link = urlparse.urljoin(url_template, href)
                    # Sleeper berths only!
                    if link not in existed and u"卧" in text:
                        to_email.append([text, link])
                    existed.add(link)
    if to_email:
        # The mail is sent as HTML, so separate entries with a line break.
        content = u"<br />".join(
            u" | ".join(entry) for entry in to_email
        ).encode("utf-8")
        simple_mail(config["people"], content)
    flush_record(existed)


def parse_content(type, soup, train):
    """
    Extract listings from a parsed page.

    `type` is a SOURCE key ("58" or "ganji"), `soup` the BeautifulSoup
    document and `train` the train code searched for. Returns a list of
    [href, text] pairs; an unknown `type` yields an empty list.

    NOTE: the parameter name `type` shadows the builtin; it is kept so the
    signature stays unchanged for existing callers.
    """
    result = []
    if type == "58":
        info_table = soup.find("table", id="infolist")
        if info_table:
            # Text nodes mentioning the train code, excluding "timetable" links.
            pattern = re.compile(u"%s(?!时刻表)" % re.escape(train), re.I)
            for node in info_table.findAll("tr", text=pattern):
                # The match is a text node; its parent is expected to be the
                # enclosing link element carrying the href.
                a = node.parent
                result.append([a["href"], BLANK_RE.sub("", a.text)])
    elif type == "ganji":
        for item in soup.findAll("dl", {"class": "list_piao"}):
            a = item.dt.a
            result.append([a["href"], a.text])
    return result


EMAIL_HOST = 'smtp.sohu.com'
EMAIL_HOST_USER = 'yourname@sohu.com'
EMAIL_HOST_PASSWORD = 'yourpassword'
EMAIL_PORT = 25


def simple_mail(to, content):
    """
    Send `content` as a UTF-8 HTML mail to the addresses in `to`.

    `to` is an iterable of recipient addresses; `content` is a UTF-8 encoded
    byte string. Connects to EMAIL_HOST:EMAIL_PORT and logs in with
    EMAIL_HOST_USER / EMAIL_HOST_PASSWORD. The connection is closed even if
    login or sending raises.
    """
    import smtplib
    from email.header import Header
    from email.mime.text import MIMEText

    subject = "[%s]有票来啦!!!!" % datetime.datetime.today().isoformat(" ")

    msgRoot = MIMEText(content, 'html', 'UTF-8')
    # The subject is non-ASCII, so it must be RFC 2047 encoded.
    msgRoot['Subject'] = Header(subject, 'UTF-8')
    msgRoot['From'] = EMAIL_HOST_USER
    msgRoot['To'] = ", ".join(to)

    s = smtplib.SMTP(EMAIL_HOST, EMAIL_PORT)
    try:
        s.login(EMAIL_HOST_USER, EMAIL_HOST_PASSWORD)
        s.sendmail(EMAIL_HOST_USER, to, msgRoot.as_string())
    finally:
        s.close()


def switch_time_zone():
    """
    Switch the process time zone to Asia/Shanghai.
    """
    os.environ["TZ"] = "Asia/Shanghai"
    time.tzset()


switch_time_zone()

if __name__ == '__main__':
    config = {
        "trains": ("k471",),
        "dates": ("20110129",),
        "people": (
            "youremail@sohu.com",
        ),
    }
    try:
        main(config)
    except Exception:
        # Top-level boundary: report the failure and let the next run retry.
        print(traceback.format_exc())
    else:
        print("%s: ok" % datetime.datetime.today())

然后放入cron,你懂的。

发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表