Implementing a Tencent Recruitment Spider with scrapy_plus
# Tencent Recruitment Spider Example
# 1 The Tencent recruitment spider code
# spiders/tencent.py
from scrapy_plus.core.spider import Spider
from scrapy_plus.http.request import Request


class TencentSpider(Spider):

    name = 'tencent'
    start_urls = ['https://hr.tencent.com/position.php']

    def parse(self, response):  # parse the responses for start_urls
        print(response.url + '*****')
        tr_list = response.xpath('//*[@class="tablelist"]//tr')[1:-1]
        print(len(tr_list))

        for tr in tr_list:
            item = {}
            # extract part of the data from the list page
            item['name'] = tr.xpath('./td[1]/a/text()')[0]
            item['address'] = tr.xpath('./td[4]/text()')[0]
            item['time'] = tr.xpath('./td[5]/text()')[0]
            # build the detail page url and send a request for it
            detail_url = 'https://hr.tencent.com/' + tr.xpath('./td[1]/a/@href')[0]
            print(detail_url)
            yield Request(
                detail_url,
                parse='parse_detail',
                meta=item  # meta takes a dict
            )

        # pagination: "下一页" is the site's "next page" link text
        print(response.xpath('//a[text()="下一页"]/@href')[0])
        next_url = 'https://hr.tencent.com/' + response.xpath('//a[text()="下一页"]/@href')[0]
        if response.xpath('//a[text()="下一页"]/@href')[0] != 'javascript:;':
            yield Request(next_url, parse='parse')

    def parse_detail(self, response):
        # print(response.body)
        item = response.meta  # retrieve the meta passed in with the request
        item['job_content'] = response.xpath('//*[@class="squareli"]//text()')[0]  # add the job description
        print(item)
        yield item
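Each Request above names its callback as a string (`parse='parse_detail'`). As a rough illustration of how such a name can be turned back into a method call, the sketch below uses `getattr`; this is only an assumption about how the scrapy_plus engine dispatches callbacks, and the `call_parse` helper is purely illustrative.

```python
def call_parse(spider, request, response):
    """Illustrative only: resolve the callback named by request.parse,
    e.g. 'parse_detail' -> spider.parse_detail, and invoke it."""
    callback = getattr(spider, request.parse)
    return callback(response)
```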
# Edit the project's settings.py
......
# enabled spider classes
SPIDERS = [
    # 'spiders.baidu.BaiduSpider',
    # 'spiders.baidu2.Baidu2Spider',
    # 'spiders.douban.DoubanSpider',
    'spiders.tencent.TencentSpider',
]
......
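Each entry in SPIDERS is a dotted path of the form `module.ClassName`. The snippet below sketches how such a path can be resolved to a class with `importlib`; it is an assumption about how the framework loads this list, and `load_class` is a hypothetical helper.

```python
import importlib

def load_class(dotted_path):
    """Illustrative helper: 'spiders.tencent.TencentSpider' -> the TencentSpider class."""
    module_path, class_name = dotted_path.rsplit('.', 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# e.g. spider_cls = load_class('spiders.tencent.TencentSpider'); spider = spider_cls()
```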
# Running the project's main.py at this point raises an exception
IndexError: list index out of range

After debugging, we find the cause: the request headers are missing a User-Agent!
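Without a browser-like User-Agent the server apparently responds with different content, so the XPath queries match nothing and indexing `[0]` on an empty result list raises `IndexError`. A quick way to see this outside the framework is to compare the two responses with the `requests` library (a hypothetical standalone check, not part of the project):

```python
import requests

url = 'https://hr.tencent.com/position.php'
ua = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}

# requests' default (non-browser) User-Agent vs. a browser-like one
plain = requests.get(url)
browser_like = requests.get(url, headers=ua)
print(len(plain.text), len(browser_like.text))  # compare the two responses
```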
# 2 Modifying the scrapy_plus code
# scrapy_plus/conf/default_settings.py
......
# default request headers
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
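Note that the spider and request code below import HEADERS from `scrapy_plus.conf.settings`, not from `default_settings` directly. A common way to wire that up is for `conf/settings.py` to import the defaults and then let the project's `settings.py` override them; the sketch below assumes that pattern and is not taken from the framework's actual source.

```python
# scrapy_plus/conf/settings.py (assumed layout, shown only as a sketch)
from scrapy_plus.conf.default_settings import *   # defaults such as HEADERS

try:
    from settings import *   # the project's settings.py overrides the defaults
except ImportError:
    pass
```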
# scrapy_plus/core/spider.py
......
from scrapy_plus.conf.settings import HEADERS
......
    def start_requests(self):
        for url in self.start_urls:
            # change here: send the default headers with every start request
            yield Request(url, headers=HEADERS, filter=False)

    def parse(self, response):
        yield Item(response.body)
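Because Request accepts a `headers` argument, an individual spider can still override `start_requests` to send its own headers instead of the defaults. The spider below is a hypothetical example of that, not part of the project:

```python
from scrapy_plus.core.spider import Spider
from scrapy_plus.http.request import Request

class MobileSpider(Spider):
    """Hypothetical spider that sends a mobile User-Agent for its start urls."""
    name = 'mobile'
    start_urls = ['https://example.com/']

    def start_requests(self):
        mobile_headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X)'}
        for url in self.start_urls:
            yield Request(url, headers=mobile_headers, filter=False)
```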
# scrapy_plus/http/request.py
from scrapy_plus.conf.settings import HEADERS  # newly added here

class Request():
    """The framework's wrapped request object"""

    def __init__(self, url, method='GET', data=None, headers=HEADERS, parse='parse', meta={}, filter=True):  # modified here
        ......
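One thing to be aware of: default arguments such as `meta={}` (and `headers=HEADERS`) are evaluated once, so every Request that relies on the default shares the same dict object, which is a classic Python pitfall. Below is a minimal sketch of the same signature with safer defaults; the attribute assignments are assumptions about the elided body, shown only to illustrate the pattern.

```python
def __init__(self, url, method='GET', data=None, headers=None,
             parse='parse', meta=None, filter=True):
    # fall back to the defaults without sharing one dict across requests
    self.headers = headers if headers is not None else dict(HEADERS)
    self.meta = meta if meta is not None else {}
    # ... rest of __init__ unchanged
```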