使用re模块爬取内涵段子:用正则表达式提取段子内容,并通过字符串替换清理多余的标签;同时使用urllib2模块获取链接的整个html内容。代码如下。
# -*- coding:utf-8 -*-
import re
import urllib2
class Content(object):
    """Scrape joke texts from the neihan8.com article listing pages.

    Each call to :meth:`get_content` downloads the listing page selected by
    ``self.page``, extracts every joke block from it and appends the cleaned
    text to ``content.txt`` in the current working directory.
    """

    # Matches the body of every <div ... class="f18 mb20"> element; re.S lets
    # "." span newlines because a joke usually covers several lines.
    _JOKE_RE = re.compile(r'<div.*?class="f18 mb20">(.*?)</div>', re.S)

    # Markup noise removed from each joke, in removal order.  These are
    # unicode literals on purpose: passing byte strings that contain
    # non-ASCII characters to unicode.replace() makes Python 2 decode them
    # as ASCII and raise UnicodeDecodeError.  \u201c / \u201d are the left
    # and right curly double quotes.  "&hellip;" is listed before the bare
    # "&hellip" so that no stray ";" is left behind.
    _NOISE = (
        u"<p>",
        u"</p>",
        u"\u201c",
        u"<br />",
        u"\u201d",
        u"&hellip;",
        u"&hellip",
    )

    def __init__(self):
        # Number of the listing page that get_html() will download.
        self.page = 1

    def get_html(self):
        """Download the listing page selected by ``self.page``.

        Returns:
            The raw (undecoded) response body.

        Raises:
            urllib2.URLError: if the page cannot be fetched.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"}
        url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # urllib2 responses are not context managers; close explicitly
            # so the connection is released even if read() fails.
            response.close()

    @staticmethod
    def _clean(raw):
        """Decode one GBK-encoded joke fragment and strip the markup noise.

        Args:
            raw: byte string captured from the page.

        Returns:
            The cleaned joke as a unicode string.
        """
        text = raw.decode("gbk")
        for token in Content._NOISE:
            text = text.replace(token, u"")
        return text

    def get_content(self):
        """Append every joke found on the current page to ``content.txt``.

        The text is written UTF-8 encoded.  Nothing is written (and the file
        is not created) when the page contains no joke blocks.
        """
        fragments = self._JOKE_RE.findall(self.get_html())
        if not fragments:
            return
        # Open the output once per page rather than once per joke; the
        # with-block closes it, so no explicit close() is needed.
        with open("content.txt", "a") as out:
            for fragment in fragments:
                out.write(self._clean(fragment).encode("utf-8"))
if __name__ == "__main__":
    # Crawl the listing pages one after another, starting with page 1.
    content = Content()
    while True:
        # Report and fetch the current page *before* advancing; incrementing
        # first would skip page 1 entirely.
        print(content.page)
        try:
            content.get_content()
        except urllib2.URLError as err:
            # HTTPError is a subclass of URLError, so this also covers the
            # error response for a page that does not exist.  Stop cleanly
            # instead of ending the crawl with a traceback.
            print("stopping at page %d: %s" % (content.page, err))
            break
        content.page += 1