爬取某pygame技术博客
By: Roy.Liu | Last updated: 2016-06-30
听说 lxml 性能比 BeautifulSoup 强, 所以测试下, 玩玩这个东西怎么样
需要用到的包: https://pypi.python.org/pypi/lxml, 在这里下载适合自己的版本, 然后跑下面的测试代码
# -*- coding: utf-8 -*-
from urllib2 import urlopen,Request
import urllib
from lxml import *
import lxml.html as HTML
import time
def error(txt):
    """Append ``txt`` plus a trailing newline to ``../it/error.txt``.

    The path is relative to the current working directory. The file is
    opened in append mode, so previously logged lines are preserved.
    """
    with open("../it/error.txt", "a") as log_file:
        log_file.write(txt + '\n')
def con(url, count=4):
    """Download ``url`` and return the raw response body.

    The request is sent with a fixed ``Referer`` and ``User-Agent`` header
    and a 20 second timeout.  Any exception triggers a retry after a one
    second pause.

    Args:
        url: Address to fetch.
        count: Starting value of the retry counter.  Retries stop once the
            counter has reached 10, so the default of 4 allows at most
            seven attempts.

    Returns:
        The response body as returned by ``read()``, or ``None`` when every
        attempt failed.  In that case the last exception is printed and
        ``url`` is recorded through ``error()``.
    """
    while True:
        try:
            req = Request(url)
            req.add_header('Referer', 'http://www.baidu.com')
            req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            res = urlopen(req, timeout=20)
            try:
                return res.read()
            finally:
                # Release the connection even when read() fails part-way.
                res.close()
        except Exception as e:
            # Deliberately broad: this is a best-effort crawler and any
            # failure (network, HTTP, timeout) is handled by retrying.
            if count >= 10:
                print(e)
                error(url)
                return None
            count += 1
            time.sleep(1)
def menu(url):
    """Yield one ``{'title': ..., 'url': ...}`` dict per ``//h5/a`` link on a page.

    The page at ``url`` is downloaded with ``con()`` and parsed with
    ``lxml.html``.  Links with an empty text or a missing/empty ``href``
    are skipped.

    Args:
        url: Address of the index page to scan.

    Yields:
        dict: ``'title'`` is the link's text content and ``'url'`` is
        ``"http:"`` followed by the link's ``href`` attribute.
    """
    page = con(url)
    if not page:
        # con() returns None after exhausting its retries (and has already
        # logged the failure); there is nothing to parse in that case.
        return
    dom = HTML.document_fromstring(page)
    for link in dom.xpath("//h5/a"):
        title = link.text_content()
        href = link.get("href")
        # Check the raw href: once "http:" is prepended the URL is never
        # empty, and a missing href (None) cannot be concatenated at all.
        if not title or not href:
            continue
        # NOTE(review): prepending "http:" presumes the site emits
        # protocol-relative hrefs ("//host/path") -- confirm against the page.
        yield {'title': title, 'url': "http:" + href}
def save(title, content):
    """Write ``content`` to ``../it/<title>.html``, replacing any existing file.

    Args:
        title: Used as the file name (converted with ``unicode()``).  Path
            separators in it are replaced by ``_``.
        content: Data to write; expected to be the raw bytes of a
            downloaded page.
    """
    # The title is scraped from a web page, so neutralise path separators:
    # otherwise it could point outside ../it/ or into a missing directory.
    file_name = unicode(title).replace('/', '_').replace('\\', '_')
    # Binary mode keeps the downloaded bytes untouched (text mode would
    # rewrite newlines on Windows).
    with open('../it/' + file_name + '.html', 'wb') as f:
        f.write(content)
def blog():
prev = menu("http://eyehere.net/2011/python-pygame-novice\
-professional-index/")
for dic in prev:
title = dic.get("title",'')
url = dic.get("url",'')
page = con(url)
save(title,page)
print "saved ",unicode(title)
if __name__ == "__main__":
    # Start the crawl only when run as a script, not on import.
    # Exceptions are intentionally left uncaught so failures show a traceback.
    blog()
From:一号门
Previous:Spring data mongo like 查询

COMMENTS