Python Crawler Series: A Scrapy Crawler Example

Python Crawler Series: Scraping Douban Movie Information

Creating the Project

  • Command-line tool: scrapy startproject DoubanBookSpider
  • It is usually best to define the data fields in items.py first
import scrapy

class DoubanMovieCommentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()         # movie title
    re_content = scrapy.Field()    # review content
    re_time = scrapy.Field()       # review publication time
    re_author = scrapy.Field()     # review author
    re_title = scrapy.Field()      # review title
    # othername = scrapy.Field()   # alternative movie title
    # url = scrapy.Field()         # movie link
    # duration = scrapy.Field()    # runtime in seconds
    # date = scrapy.Field()        # release date
    # director = scrapy.Field()    # director
    # actors = scrapy.Field()      # actors
    # style = scrapy.Field()       # genre
    # area = scrapy.Field()        # country/region
  • Configure the relevant options in settings.py
ROBOTSTXT_OBEY = True   # obey robots.txt
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
# Note: this project defines several spiders; before running a given spider,
# enable its specific ItemPipeline here (see the sketch after this block)
ITEM_PIPELINES = {
    # 'DoubanBookSpider.pipelines.DoubanbookspiderPipeline': 300,
    # 'DoubanBookSpider.pipelines.DoubanMailPipeline': 600,
    'DoubanBookSpider.pipelines.DoubanMovieCommentPipeline': 900,
}
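Instead of editing settings.py every time you switch spiders, Scrapy also lets each spider carry its own pipeline configuration through the custom_settings class attribute. The following is a minimal sketch, not part of the original project; it reuses the pipeline path shown above:

import scrapy

class DoubanSpider(scrapy.Spider):
    name = "doubanmovie"
    # Per-spider settings override settings.py, so only this pipeline
    # is active when this particular spider runs.
    custom_settings = {
        'ITEM_PIPELINES': {
            'DoubanBookSpider.pipelines.DoubanMovieCommentPipeline': 900,
        },
    }
    # ... rest of the spider as defined later in this post ...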

Storing the Data

  • The earlier posts covered fetching and parsing; the last step in building the crawler is storing the data
  • Use the pymysql library to write the scraped data into a MySQL database
  • This code naturally lives in the data-processing file, pipelines.py
import pymysql

class DoubanMovieCommentPipeline(object):
    def process_item(self, item, spider):
        connect = pymysql.connect(user="root", password="root", port=3306, host="127.0.0.1",
                                  db="douban_movie", charset="utf8")
        con = connect.cursor()
        con.execute("insert into douban_review(title,re_author,re_title,re_content,re_time) values(%s,%s,%s,%s,%s)",
                    [item['title'], item['re_author'], item['re_title'], item['re_content'], item['re_time']])
        connect.commit()
        con.close()
        connect.close()
        print('////////////////////// wrote one row //////////////////////')
        return item
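The pipeline above opens and closes a new connection for every item, which is slow. A common alternative, sketched below with the same connection parameters, is to open the connection once in open_spider and close it in close_spider; Scrapy calls these hooks automatically at the start and end of the crawl:

import pymysql

class DoubanMovieCommentPipeline(object):
    def open_spider(self, spider):
        # one connection for the whole crawl instead of one per item
        self.connect = pymysql.connect(user="root", password="root", port=3306,
                                       host="127.0.0.1", db="douban_movie", charset="utf8")
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        self.cursor.execute(
            "insert into douban_review(title,re_author,re_title,re_content,re_time) "
            "values(%s,%s,%s,%s,%s)",
            [item['title'], item['re_author'], item['re_title'],
             item['re_content'], item['re_time']])
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()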

Your machine needs a MySQL server installed; Navicat is a convenient client for managing it.
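The insert statement above assumes a douban_review table already exists. The exact schema is not shown in the original; a plausible layout matching the five item fields, created through pymysql, might look like this:

import pymysql

connect = pymysql.connect(user="root", password="root", port=3306,
                          host="127.0.0.1", db="douban_movie", charset="utf8")
cursor = connect.cursor()
# assumed schema: five text columns matching the fields used by the pipeline
cursor.execute("""
    CREATE TABLE IF NOT EXISTS douban_review (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        re_author VARCHAR(255),
        re_title VARCHAR(255),
        re_content TEXT,
        re_time VARCHAR(64)
    ) DEFAULT CHARSET=utf8
""")
connect.commit()
cursor.close()
connect.close()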

Crawling the Data

  • No login form needs to be submitted
# -*- coding: utf-8 -*-
import scrapy
from DoubanBookSpider.items import DoubanMovieCommentItem
import json
from scrapy import Selector
import re
import time


class DoubanSpider(scrapy.Spider):
    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    start_urls = ['https://movie.douban.com']
    list1 = ['教父', '肖申克的救赎', '卡萨布兰卡', '卧虎藏龙', '天使爱美丽', '这个杀手不太冷', '千与千寻', '窃听风暴', '阿甘正传', '霸王别姬', '大话西游', '无间道', '阿凡达', '画皮', '让子弹飞', '唐山大地震', '金陵十三钗', '龙门飞甲', '非诚勿扰', '盗梦空间', '建国大业', '十月围城', '满城尽带黄金甲', '速度与激情5', '集结号', '画皮']
    num = 0
    header = {'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
              'Accept-Language': 'zh-CN,zh;q=0.9',
              'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
              'Accept-Encoding': 'gzip, deflate',
              }

    def start_requests(self):  # override
        url = 'https://movie.douban.com/'
        # use a Request subclass such as FormRequest when simulating form or Ajax POST submissions
        yield scrapy.FormRequest(
            url=url,
            callback=self.parse, headers=self.header, encoding='utf-8'
        )

    # fetch movie information for every title in list1
    def parse(self, response):
        for i in self.list1:
            url = 'https://movie.douban.com/j/subject_suggest?q={0}'.format(i)
            # instantiate a Request, hand the url to the downloader and get a response back
            # fetches the suggestion data covering all versions of this movie
            yield scrapy.Request(url, callback=self.link_page, headers=self.header, dont_filter=True)  # e.g. https://movie.douban.com/subject/26363254/reviews

    # extract the detail-page link of each movie version from the suggestion data
    def link_page(self, response):
        # "url":"https:\/\/movie.douban.com\/subject\/1291841\/?suggest=%7B%E6%95%99%E7%88%B6%7D"
        u = re.search('"url":"(.*?)"', response.text)  # matches the pattern in the comment above
        if u is not None:
            url = u.group(1).replace('\\/', '/')  # text captured by the first group, e.g. https://movie.douban.com/subject/1291841/?suggest=%7B%E6%95%99%E7%88%B6%7D
            id = re.search(r'(\d+)', url)  # 1291841
            meta = {}
            meta['id'] = id.group(1)  # information to carry along, passed as a dict
            yield scrapy.Request(url, callback=self.parse_page, meta=meta, headers=self.header, dont_filter=True)

    # parse the movie detail page
    def parse_page(self, response):
        # print(response.url)  # https://movie.douban.com/subject/26363254/
        id = response.meta['id']
        # XPath selectors copied from the browser's element inspector
        title = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract_first()
        type = response.xpath('//*[@id="info"]/span[5]/text()').extract_first()
        if type is not None:
            style = type
        for node in response.xpath('//*[@id="info"]/span').extract():
            selector = Selector(text=node)
            des = selector.xpath('//span')
            content = des[0].xpath('normalize-space(string(.))').extract()[0].replace('\xa0', '')
            if '导演' in content:
                director = content.replace('导演:', '')
            elif '编剧:' in content:
                author = content.replace('编剧:', '')
            elif '主演:' in content:
                actor = content.replace('主演:', '')
            elif '分钟' in content:
                duration = content.replace('分钟', '')
            elif '制片国家/地区:' in content:
                area = content.replace('制片国家/地区:', '')
            elif '又名:' in content:
                an_title = content.replace('又名:', '')
            t = re.search(r'(\d{4}-\d{2}-\d{2})', content)
            if t is not None:
                release_date = t.group(1)  # renamed from `time` so the time module is not shadowed
        # move on to the review list page
        re_url = 'https://movie.douban.com/subject/' + str(id) + '/reviews'
        # e.g. https://movie.douban.com/subject/1291841/reviews
        meta = {}
        meta['review_url'] = re_url
        yield scrapy.Request(re_url, callback=self.review_page, meta=meta, headers=self.header,
                             dont_filter=True)  # https://movie.douban.com/subject/26363254/reviews

    # review list page; each review links to its own detail page
    def review_page(self, response):
        review_url = response.meta['review_url']
        meta = {}
        meta['review_url'] = review_url
        # all reviews on this page:
        # match every element whose id attribute is purely numeric: [re:match(@id, "\d+")]
        # alternatively: //*[@id="content"]/div/div[1]/div[1]/div
        resultList = response.xpath(r'//*[re:match(@id, "\d+")]')
        # print('resultList: ', resultList)
        for res in resultList:
            self.num = self.num + 1
            author = res.xpath('./header/a[2]/text()').extract_first()
            if author:
                # pubdate = res.xpath('./header/span[2]/text()').extract_first()
                url = res.xpath('./div/h2/a/@href').extract_first()  # link to the full review page
                # title = res.xpath('./div/h2/a/text()').extract_first()
                # parse the individual review
                yield scrapy.Request(url, callback=self.review_parse, headers=self.header, dont_filter=True)
        # //*[@id="content"]/div/div[1]/div[2]/span[4]
        # the "next page" link
        next_link = response.xpath('//*[@id="content"]/div/div[1]/div[2]/span[4]/a/@href').extract_first()
        # page numbers: 2 3 4 ...
        next_page = response.xpath('//*[@id="content"]/div/div[1]/div[2]/span[4]/a/text()').extract_first()
        # keep paging until the "next page" label disappears on the last page
        if next_page == '后页>' and self.num <= 40:
            next_url = review_url + next_link
            # print(next_url)
            print('==================== crawling the next page ==========================')
            time.sleep(30)
            yield scrapy.Request(next_url, callback=self.review_page, meta=meta, headers=self.header, dont_filter=True)

    # parse an individual review page
    def review_parse(self, response):
        item = DoubanMovieCommentItem()
        title = response.xpath('//*[@id="content"]/div/div[1]/h1/span/text()').extract_first()
        if title:
            item['re_title'] = title
        else:
            item['re_title'] = ' '
        des = response.xpath('//*[@id="link-report"]/div[1]')
        content = des[0].xpath('normalize-space(string(.))').extract()[0].replace('\xa0', '')
        if content:
            item['re_content'] = content
        else:
            item['re_content'] = ' '
        print(title)
        resultList = response.xpath(r'//*[re:match(@id, "\d+")]')
        for res in resultList:
            author = res.xpath('./header/a[1]/span/text()').extract_first()
            if author:
                dianying_name = res.xpath('./header/a[2]/text()').extract_first()
                pubdate = res.xpath('./header/span[3]/text()').extract_first()
                item['re_author'] = author
                if dianying_name:
                    item['title'] = dianying_name
                else:
                    item['title'] = ' '
                if pubdate:
                    item['re_time'] = pubdate
                else:
                    item['re_time'] = ' '
                yield item

Run the command scrapy crawl doubanmovie to start crawling.
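If you prefer to launch the spider from a script instead of the command line, Scrapy's CrawlerProcess does the same job. This is a small sketch; the import path of DoubanSpider is an assumption and depends on where the spider file actually lives in your project:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
# assumed module path; adjust it to wherever the doubanmovie spider file is saved
from DoubanBookSpider.spiders.doubanmovie import DoubanSpider

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl(DoubanSpider)
process.start()  # blocks until the crawl finishes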

Crawling Reviews with a Login

  • Forms that require login credentials are usually submitted via POST; you can inspect the Form Data under the Network tab of the browser developer tools
import scrapy
from faker import Factory
from DoubanBookSpider.items import DoubanMovieCommentItem
from urllib import parse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'  # or f.user_agent() for a randomly faked one
    }

    # login form data
    formdata = {
        'form_email': 'szsplyr@163.com',
        'form_password': '******',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # if a captcha appears it has to be solved manually
        if 'captcha_image' in response.text:
            print('Copy the link:')
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print('captcha link: ', link)
            captcha_solution = input('captcha-solution:')
            # the captcha id is carried in the query string of the captcha image url
            captcha_id = parse.parse_qs(parse.urlparse(link).query)['id'][0]
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        # submit the complete login form
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    # continue crawling after the login succeeds
    def after_login(self, response):
        print(response.status)
        self.headers['Host'] = "www.douban.com"
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url)
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)  # do not deduplicate

    # crawl the next page of reviews
    def parse_next_page(self, response):
        print(response.status)
        try:
            # the href is relative, so join it with the current url
            next_url = response.urljoin(response.xpath('//span[@class="next"]/a/@href').extract()[0])
            print("next page")
            print(next_url)
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment_url,
                                 dont_filter=True)
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_next_page,
                                 dont_filter=True)
        except:
            print("Next page Error")
            return

    # parse_next_page and parse_comment_url could be combined into one callback
    # with a for loop and yield; they are kept separate here for readability
    # (see the sketch after this code block)

    # handle one review-list page
    def parse_comment_url(self, response):
        print(response.status)
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract()[0]
            print(comment_title)
            print(comment_url)
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    # process the scraped data
    # note: the fields used below (useful_num, no_help_num, people, ...) would also
    # have to be declared on DoubanMovieCommentItem in items.py
    def parse_comment(self, response):
        print(response.status)
        for item in response.xpath('//div[@id="content"]'):
            comment = DoubanMovieCommentItem()
            comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
            comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
            comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
            comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
            comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]

            data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
            print("data_type: " + data_type)
            if data_type == '0':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div/p/text()').extract()))
            elif data_type == '1':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div[1]/text()').extract()))
            comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
            comment['comment_page_url'] = response.url
            yield comment
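As the comment in the middle of the spider suggests, parse_next_page and parse_comment_url can be folded into one callback: yield a request for every review on the current page, then follow the "next" link back into the same callback. A minimal sketch of that combined method, reusing the selectors above (not the author's original code, and meant to live inside MailSpider):

# inside MailSpider:
def parse_review_list(self, response):
    # yield one request per review on this page
    for item in response.xpath('//div[@class="main review-item"]'):
        comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
        yield scrapy.Request(url=comment_url,
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment)
    # then follow the "next" link, if any, back into this same callback
    next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
    if next_href:
        yield scrapy.Request(url=response.urljoin(next_href),
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_review_list,
                             dont_filter=True)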

Summary

Web scraping keeps getting harder and the technical bar keeps rising; an example that works today may break within days. Many sites now require JavaScript reverse engineering, and since apps have become an important data source as well, app reverse engineering is close to a mandatory crawler skill. This is not entirely bad news: demand for crawler engineers keeps growing and salaries have risen with it. This post has only walked through the basics by imitation; if you want to become a serious crawler "hacker", it will take far more effort, but this is a reasonable place to start and work your way up.

At the same time, crawling skirts the edge of the law. Respect privacy and keep legal considerations in mind when collecting data.

If you only have occasional data-collection needs, a point-and-click tool such as 八爪鱼 (Octoparse) will do; for analysing what you collect, try Python's jieba word segmentation together with the pandas module for simple statistics.
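As a tiny illustration of that last point (a sketch, not from the original post, using made-up review strings), jieba can tokenize the scraped review text and pandas can count word frequencies:

import jieba
import pandas as pd

# example review texts; in practice these would come from the douban_review table
reviews = [
    "这部电影的配乐和摄影都非常出色",
    "剧情有些拖沓,但是演员的表演很出色",
]

# cut every review into words and flatten into one list, dropping single characters
words = [w for text in reviews for w in jieba.lcut(text) if len(w) > 1]

# count word frequencies with pandas
freq = pd.Series(words).value_counts()
print(freq.head(10))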
