本文共 1381 字,大约阅读时间需要 4 分钟。
由于之前的改造,现在将爬虫的功能做了一些改变,具体实现的功能是将推荐的日志全部抓取下来,并存放在以文章标题命名的文件中,代码如下:
import urllib
import os,re
import sys
from bs4 import BeautifulSoup
# Python 2-only workaround: reloading sys makes setdefaultencoding callable
# again, and the call below switches the process-wide default codec used for
# implicit str/unicode conversions to UTF-8.
# NOTE(review): presumably needed so the text written to files further down
# is encoded as UTF-8 instead of failing under the ASCII default - confirm.
reload(sys)
sys.setdefaultencoding("utf-8")
def if_str(str_t):
    """Print the marker " saf" when *str_t* has no ASCII letter.

    The pattern is anchored at the start of the string and ``.`` does not
    cross newlines, so only the text before the first line break is examined.

    Args:
        str_t: Text to inspect.

    Returns:
        None. The only effect is the optional line printed to stdout.
    """
    if re.search(r"^.*[a-zA-Z].*", str_t) is None:
        # Single-argument call form prints the same text on Python 2 and 3.
        print(" saf")
def get_blog(url):
    """Download one blog post and save its body text to "<title>.txt".

    The page at *url* is fetched, the text of its <title> becomes the file
    name, and the text of every <div> found between the body-start and
    body-end HTML comment markers is written to that file in the current
    working directory.

    Nothing is written when the page is empty, has no usable <title> text,
    or the title contains an ASCII letter, whitespace, or one of
    ``| " , < >``. A page that passes the title check but lacks the body
    markers still produces an (empty) file, as before.

    Args:
        url: Address of the blog post to fetch.

    Returns:
        None.
    """
    import io  # local import: only needed for the explicit-encoding open below

    response = urllib.urlopen(url)
    try:
        page = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    if not page:
        return

    # Parse once; the title text doubles as the output file name.
    title_tag = BeautifulSoup(page).title
    if title_tag is None or title_tag.string is None:
        # .string is None when <title> is empty or holds nested markup.
        return
    str_title = title_tag.string
    if re.search(r"^.*[a-zA-Z|\s\",<>].*", str_title) is not None:
        return

    # The title comes from a remote page: neutralise path separators so it
    # cannot point outside the current directory.
    safe_title = re.sub(r"[\\/]", "_", str_title)

    # Regular expression matching the article body section.
    page_js = r"<!-- 正文开始 -->[\s\S]*<!-- 正文结束 -->"
    body_match = re.search(page_js, page)

    # Explicit UTF-8 so the write does not depend on the interpreter's
    # default encoding; the with-block closes the file on every path.
    with io.open("%s.txt" % safe_title, "w", encoding="utf-8") as fp:
        if body_match:
            soup = BeautifulSoup(body_match.group(0), from_encoding="gb18030")
            # NOTE: find_all descends into nested tags, so text inside
            # nested <div>s is written once per enclosing <div>.
            for div in soup.find_all("div"):
                fp.write(div.get_text().lstrip())
def main():
    """Crawl the Sina star-blog index pages and save every linked post.

    For index pages 1 through 139 the page URL is appended to
    "EveryPageHref.txt", the page is fetched, and each <li> of the first
    <ul class="list_009"> is scanned; the href of its second <a> is passed
    to get_blog(). "OK!" is printed after each post that was handed over.

    The original code wrapped the loop in an ``if i < 7`` test that was
    evaluated once with ``i == 1`` and therefore never limited the crawl;
    the range below reproduces the pages that were actually requested.
    """
    index_url_template = "http://roll.ent.sina.com.cn/blog/star/index_%d.shtml"
    # This one post was explicitly excluded by the original author; the
    # reason is not recorded in the code.
    skipped_link = "http://blog.sina.com.cn/s/blog_4a6c545e0102vgwe.html"

    for page_no in range(1, 140):
        url = index_url_template % page_no

        # Keep a running log of every index page that was requested.
        with open("EveryPageHref.txt", "a") as fp:
            fp.write(url + "\n")

        response = urllib.urlopen(url)
        try:
            page = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(page, from_encoding="gb18030")
        list_ul = soup.find_all("ul", class_="list_009")
        if not list_ul:
            # Index page without the expected list: nothing to collect.
            continue

        for li in list_ul[0].find_all("li"):
            list_a = li.find_all("a")
            if len(list_a) < 2:
                # The post link is the second anchor; skip malformed items.
                continue
            one_link = list_a[1].get("href")  # link to the blog post
            if one_link and one_link != skipped_link:
                get_blog(one_link)
                print("OK!")


if "__main__" == __name__:
    main()
另外附上一张成果图: