爬取某pygame技术博客

听说 lxml 的性能比 BeautifulSoup 强, 所以测试一下, 看看这个东西怎么样
需要用到的包: https://pypi.python.org/pypi/lxml, 在这里下载适合自己的版本, 然后运行下面的测试代码

# -*- coding: utf-8 -*-
from urllib2 import urlopen,Request
import urllib
from lxml import *
import lxml.html as HTML
import time
def error(txt):
    """Append ``txt`` followed by a newline to ``../it/error.txt``.

    The path is relative to the current working directory and the file is
    opened in append mode, so earlier entries are kept.
    """
    with open("../it/error.txt", "a") as log_file:
        entry = txt + '\n'
        log_file.write(entry)
def con(url,count=4):
    """Download ``url`` and return the raw response body.

    Every attempt sends a fixed ``Referer`` and ``User-Agent`` header and
    uses a 20 second timeout.  When an attempt fails the function sleeps
    for one second, increments ``count`` and tries again; it gives up once
    an attempt fails with ``count`` already at 10 or more (seven attempts
    with the default ``count=4``).

    Args:
        url: Address to fetch.
        count: Starting value of the retry counter.  Larger values mean
            fewer attempts before giving up.

    Returns:
        Whatever ``read()`` returns for the response, or ``None`` when all
        attempts failed.  On giving up, the last exception is printed and
        ``url`` is appended to the error log through ``error()``.
    """
    while True:
        try:
            req = Request(url)
            req.add_header('Referer', 'http://www.baidu.com')
            req.add_header('User-Agent',
                           'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            res = urlopen(req, timeout=20)
            try:
                return res.read()
            finally:
                # Close the connection even when read() fails part-way,
                # so retries do not pile up open sockets.
                res.close()
        except Exception as e:
            if count >= 10:
                print(e)
                error(url)
                return None
            count += 1
            time.sleep(1)
def menu(url):
    """Yield the article links found on the index page at ``url``.

    The page is fetched with ``con()`` and parsed with lxml; every ``<a>``
    that is a direct child of an ``<h5>`` is considered a link.

    Args:
        url: Address of the index page.

    Yields:
        Dicts with two keys: ``'title'`` (the anchor's text content) and
        ``'url'`` (the anchor's ``href``; protocol-relative values such as
        ``//host/path`` get an ``http:`` scheme prepended).  Anchors with an
        empty title or without an ``href`` are skipped.  Nothing is yielded
        when the page could not be downloaded.
    """
    page = con(url)
    if not page:
        # con() returns None after exhausting its retries; an empty body
        # has nothing to parse either.
        return
    dom = HTML.document_fromstring(page)
    for link in dom.xpath("//h5/a"):
        title = link.text_content()
        href = link.get("href")
        if not title or not href:
            continue
        if href.startswith("//"):
            # Protocol-relative link: add the scheme so it can be fetched.
            href = "http:" + href
        yield {'title': title, 'url': href}
def save(title,content):
    """Write ``content`` to ``../it/<title>.html``.

    Args:
        title: Name used for the output file.  It is converted with
            ``unicode()`` and any ``/`` or ``\\`` in it is replaced by
            ``_`` so the file always lands directly inside ``../it/``.
        content: Data to write, typically the raw page body returned by
            ``con()``.

    The file is opened in binary mode so the downloaded bytes are stored
    unchanged (text mode would rewrite line endings on Windows).  An
    existing file with the same name is overwritten.
    """
    # The title is scraped from a web page; a path separator in it would
    # point outside ../it/ or at a directory that does not exist.
    safe_title = unicode(title).replace(u'/', u'_').replace(u'\\', u'_')
    with open('../it/' + safe_title + '.html', 'wb') as f:
        f.write(content)
def blog():
    prev = menu("http://eyehere.net/2011/python-pygame-novice\
-professional-index/")
    for dic in prev:
        title = dic.get("title",'')
        url = dic.get("url",'')
        page = con(url)
        save(title,page)
        print "saved      ",unicode(title)

if __name__ == "__main__":
    # Run the crawl only when executed as a script.  Exceptions are left
    # uncaught so the full traceback stays visible.
    blog()
        
        

上一篇: Spring data mongo like 查询
下一篇: 深入理解elasticsearch pdf 下载, elasticsearch in action
 评论 ( What Do You Think )
名称
邮箱
网址
评论
验证
   
 

 


  • 微信公众号

  • 我的微信

站点声明:

1、一号门博客CMS,由Python, MySQL, Nginx, Wsgi 强力驱动

2、部分文章来源于互联网, 若有侵权, 联系邮箱:summer@yihaomen.com, 同时欢迎大家注册用户,主动发布文章.

3、鄂ICP备14001754号-3, 鄂公网安备 42280202422812号