Python抓取QQ空间博客文章
Python抓取QQ空间博客文章
外面刮着台风,下午把人人相册的爬虫写了,晚上我无聊又把QQ空间博客的爬虫写了。默认只抓取所提供Q号的空间,可以在main.py里面填上Q号,也可以加个循环抓取多个Q号。博客里面的图片暂时没有处理,要下载回来也很简单,有空再完善。
- # -*-coding:utf-8-*-
- # Filename: main.py
- # 作者:华亮
- #
- from QQ import QQ
if __name__ == '__main__':
    # qq is the QQ number whose space is crawled; filename is the file the
    # downloaded articles are saved to.
    QQ.DownloadBlog(qq='414112390', filename='blog.txt')
- # -*-coding:utf-8-*-
- # Filename: QQ.py
- # 作者:华亮
- #
- import urllib
- import urllib2
- import re
- from HTMLParser import HTMLParser
# Collects the blog list of a QQ space page.
class QQBlogList(HTMLParser):
    '''
    HTML parser that records (link text, href) pairs for the <a> elements
    found inside <div class="bloglist"> / <ul> / <li>.

    The pairs are appended to blogList in document order.
    '''

    # Parser state: which of the interesting elements are currently open.
    in_key_div = False
    in_ul = False
    in_li = False
    in_a = False
    # NOTE: blogList is defined on the class and only ever mutated in place,
    # so every instance appends to the same list and entries accumulate
    # across parsers.
    blogList = []
    # href of the most recently opened article link.
    lasturl = ''

    # Closing tag -> name of the state flag that the tag clears.
    _CLOSING_FLAGS = {
        'div': 'in_key_div',
        'ul': 'in_ul',
        'li': 'in_li',
        'a': 'in_a',
    }

    def handle_starttag(self, tag, attrs):
        '''Track entry into the blog-list container and its links.'''
        attributes = dict(attrs)
        if tag == 'div' and attributes.get('class') == 'bloglist':
            self.in_key_div = True
            return
        if not self.in_key_div:
            # Everything outside the blog-list container is ignored.
            return
        if tag == 'ul':
            self.in_ul = True
        elif tag == 'li' and self.in_ul:
            self.in_li = True
        elif tag == 'a' and self.in_li and 'href' in attributes:
            self.in_a = True
            self.lasturl = attributes['href']

    def handle_data(self, data):
        '''Record the text found inside an article link with its href.'''
        if not self.in_a:
            return
        self.blogList.append((data, self.lasturl))

    def handle_endtag(self, tag):
        '''Clear the state flag that belongs to the closing tag, if set.'''
        flag = self._CLOSING_FLAGS.get(tag)
        if flag is not None and getattr(self, flag):
            setattr(self, flag, False)
class QQ:
    '''
    Downloads the blog articles of a QQ space into a text file.

    Original author: Hua Liang.
    '''

    @staticmethod
    def DownloadBlog(qq, filename = None):
        '''
        Download every blog article listed for one QQ space.

        qq       -- QQ number; it is substituted into the blog-list URL.
        filename -- file the articles are written to; "blog.txt" is used
                    when it is None or empty.
        '''
        # Single-argument print(...) produces the same output as the print
        # statement on Python 2 and also parses on Python 3.
        print('Start')
        blogurl = 'http://qz.qq.com/%s/bloglist?page=0' % qq
        QQ.__Download(blogurl, filename)
        print('End')

    @staticmethod
    def __Fetch(opener, url):
        '''Return the body of url, always closing the HTTP response.'''
        result = opener.open(urllib2.Request(url))
        try:
            return result.read()
        finally:
            result.close()

    @staticmethod
    def __Download(starturl, filename):
        '''
        Walk the paginated blog list starting at starturl, then download
        each listed article and write its title and content to filename.
        '''
        # One opener with a cookie processor is reused for every request.
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        # Both patterns are loop-invariant, so they are compiled once.
        nextpagePattern = re.compile(r'<a href="(.*?)" title="下一页" class="bt_next"><span>下一页</span></a>')
        blogContentPattern = re.compile(r'<div class="entry_content">(.*?)</div>', re.S)

        # Pass 1: collect the (title, url) pairs from every list page.
        posts = []
        visited = set()
        url = starturl
        # Stop if a "next page" link leads back to a page already seen,
        # which would otherwise loop forever.
        while url not in visited:
            visited.add(url)
            text = QQ.__Fetch(opener, url)
            parser = QQBlogList()
            # Give this parser its own result list so that entries cannot
            # leak in from other parser instances or from earlier calls.
            parser.blogList = []
            parser.feed(text)
            parser.close()
            posts.extend(parser.blogList)
            nextpage = nextpagePattern.search(text)
            if not nextpage:
                break
            url = nextpage.group(1)

        if not filename:
            filename = "blog.txt"

        # Pass 2: download the articles. The with-block closes the output
        # file even when a download fails part-way through.
        with open(filename, 'w') as output:
            for title, url in posts:
                print('Downloading %s' % title)
                html = QQ.__Fetch(opener, url)
                output.write('\n' + title + '\n')
                ret = blogContentPattern.search(html)
                if ret:
                    # Paragraph tags become line breaks in the text file.
                    output.write(ret.group(1).replace('<p>', '\n'))
评论暂时关闭