爬取某pygame技术博客

By：Roy.Liu  Last updated：2016-06-30

听说 lxml 的性能比 BeautifulSoup 强，所以写段代码测试一下，看看这个库用起来怎么样。
需要用到的包: https://pypi.python.org/pypi/lxml ，在这里下载适合自己环境的版本，然后运行下面的测试代码。

# -*- coding: utf-8 -*-
from urllib2 import urlopen,Request
import urllib
from lxml import *
import lxml.html as HTML
import time
def error(txt):
    """Append ``txt`` plus a trailing newline to ``../it/error.txt``."""
    with open("../it/error.txt","a") as log_file:
        line = txt + '\n'
        log_file.write(line)
def con(url,count=4):
    """Download ``url`` and return the raw response body.

    The request is sent with a Baidu ``Referer`` header, an old MSIE
    ``User-Agent`` header and a 20-second timeout.  Any exception raised
    while requesting or reading triggers a retry after a one-second pause.

    ``count`` is the running attempt counter: retries stop once it has
    reached 10, so the default of 4 allows seven attempts in total.

    Returns the body as read from the response, or ``None`` when every
    attempt failed.  In that case the last exception is printed and
    ``url`` is appended to the error log through ``error``.
    """
    while True:
        try:
            req = Request(url)
            req.add_header('Referer','http://www.baidu.com')
            req.add_header('User-Agent','Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            res = urlopen(req,timeout = 20)
            try:
                # Close the response even when read() fails part-way, so a
                # retried download does not leak the previous connection.
                return res.read()
            finally:
                res.close()
        except Exception as e:
            if count >= 10:
                # Out of retries: report, log the URL, and signal failure.
                print(e)
                error(url)
                return None
            count += 1
            time.sleep(1)
def menu(url):
    """Yield a ``{'title': ..., 'url': ...}`` dict for each ``//h5/a`` link.

    The page at ``url`` is fetched with ``con`` and parsed with lxml.  Each
    link's ``href`` is resolved against ``url``, so protocol-relative links
    (``//host/path``) take the scheme of the index page, while absolute
    links are kept unchanged.

    Links without an ``href`` or without any text are skipped.  Nothing is
    yielded when the index page could not be downloaded (``con`` returned
    ``None``).
    """
    # Python 2 standard library, matching the file's use of urllib2.
    from urlparse import urljoin

    page = con(url)
    if page is None:
        # The download failed; there is nothing to parse.
        return
    dom = HTML.document_fromstring(page)
    for n in dom.xpath("//h5/a"):
        href = n.get("href")
        title = n.text_content()
        if not href or not title:
            # "http:" + None used to raise TypeError here.
            continue
        yield {'title': title, 'url': urljoin(url, href)}
def save(title,content):
    """Write ``content`` to ``../it/<title>.html``, truncating any existing file."""
    target = '../it/' + unicode(title) + '.html'
    with open(target, 'w') as out:
        out.write(content)
def blog():
    prev = menu("http://eyehere.net/2011/python-pygame-novice\
-professional-index/")
    for dic in prev:
        title = dic.get("title",'')
        url = dic.get("url",'')
        page = con(url)
        save(title,page)
        print "saved      ",unicode(title)

# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    blog()

From：一号门

Tags: python 爬虫

Previous:Spring data mongo like 查询

Next:深入理解elasticsearch pdf 下载, elasticsearch in action

COMMENTS

爬取某pygame技术博客

RELATED ARTICLES

COMMENTS