Implementing the Tencent Recruitment Spider with scrapy_plus

# Tencent Recruitment Spider Case Study

# 1 The Tencent Spider Code
```python
from scrapy_plus.core.spider import Spider
from scrapy_plus.http.request import Request


class TencentSpider(Spider):
    name = 'tencent'
    start_urls = ['https://hr.tencent.com/position.php']

    def parse(self, response):  # parse the responses for start_urls
        print(response.url + '*****')
        tr_list = response.xpath('//*[@class="tablelist"]//tr')[1:-1]
        print(len(tr_list))
        for tr in tr_list:
            item = {}
            # extract the fields available on the list page
            item['name'] = tr.xpath('./td[1]/a/text()')[0]
            item['address'] = tr.xpath('./td[4]/text()')[0]
            item['time'] = tr.xpath('./td[5]/text()')[0]
            # build the detail page URL and request it
            detail_url = 'https://hr.tencent.com/' + tr.xpath('./td[1]/a/@href')[0]
            print(detail_url)
            yield Request(
                detail_url,
                parse='parse_detail',
                meta=item  # meta accepts a dict
            )
        # pagination: follow the "next page" link until it is disabled
        next_href = response.xpath('//a[text()="下一页"]/@href')[0]
        print(next_href)
        if next_href != 'javascript:;':
            yield Request('https://hr.tencent.com/' + next_href, parse='parse')

    def parse_detail(self, response):
        # print(response.body)
        item = response.meta  # retrieve the meta dict passed in by parse()
        item['job_content'] = response.xpath('//*[@class="squareli"]//text()')[0]  # add the job duties data
        print(item)
        yield item
```
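With the spider written, register it in the project's settings.py (shown next) and run the project's entry script. The exact bootstrap depends on how your project is laid out; a minimal main.py, assuming scrapy_plus exposes an Engine class in scrapy_plus.core.engine as in the earlier chapters, might look like this sketch:

```python
# main.py - project entry point (a sketch; the Engine import path is an assumption)
from scrapy_plus.core.engine import Engine

if __name__ == '__main__':
    engine = Engine()  # the engine reads SPIDERS from the project's settings.py
    engine.start()     # schedule start_requests and drive the crawl
```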
```python
# Modify the project's settings.py
......
# enabled spider classes
SPIDERS = [
    # 'spiders.baidu.BaiduSpider',
    # 'spiders.baidu2.Baidu2Spider',
    # 'spiders.douban.DoubanSpider',
    'spiders.tencent.TencentSpider',
]
......
```
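Each SPIDERS entry is a dotted import path, which the framework resolves to a class at startup. How scrapy_plus does this internally may differ, but the general pattern is plain standard-library dynamic importing, sketched below (load_spiders is a hypothetical helper name):

```python
import importlib

def load_spiders(paths):
    """Resolve dotted paths like 'spiders.tencent.TencentSpider' to instances."""
    spiders = {}
    for path in paths:
        module_path, class_name = path.rsplit('.', 1)
        module = importlib.import_module(module_path)  # e.g. import spiders.tencent
        cls = getattr(module, class_name)              # e.g. grab TencentSpider
        spiders[cls.name] = cls()                      # key instances by spider name
    return spiders
```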
Running the project's main.py at this point raises an exception:

```
IndexError: list index out of range
```

Debugging reveals the cause: the request headers are missing a User-Agent! Without one, the server does not return the expected job-listing page, so the XPath selectors match nothing and indexing `[0]` into an empty result raises the IndexError.
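A quick way to confirm this kind of problem is to fetch the page with and without a browser User-Agent and compare the responses; a sketch using the third-party requests library (the site's exact behavior today may differ):

```python
import requests

url = 'https://hr.tencent.com/position.php'
ua = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}

bare = requests.get(url)                 # default python-requests User-Agent
browser = requests.get(url, headers=ua)  # browser-like User-Agent
# if the server discriminates by User-Agent, the body sizes differ noticeably
print(len(bare.text), len(browser.text))
```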
# 2 Modifying the scrapy_plus Code
```python
# scrapy_plus/conf/default_settings.py
......
# default request headers
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
```
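Note that the modules below import HEADERS from scrapy_plus.conf.settings, not from default_settings directly. For that to work, the settings module has to merge the framework defaults with the project's settings.py; a common pattern for this, sketched here as an assumption about how scrapy_plus is wired:

```python
# scrapy_plus/conf/settings.py (a sketch of the defaults-then-overrides pattern)
from scrapy_plus.conf.default_settings import *  # framework defaults first

try:
    from settings import *  # the project's settings.py overrides any default
except ImportError:
    pass  # no project-level settings module; the defaults stay in effect
```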
```python
# scrapy_plus/core/spider.py
......
from scrapy_plus.conf.settings import HEADERS
......
    def start_requests(self):
        for url in self.start_urls:
            # changed here: send the default headers with every start request;
            # filter=False skips deduplication so start requests always go out
            yield Request(url, headers=HEADERS, filter=False)

    def parse(self, response):
        yield Item(response.body)
```
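Since headers is an ordinary Request parameter, an individual request can still override or extend the defaults; a hypothetical example (the Referer value is illustrative only, not part of the original spider):

```python
yield Request(
    detail_url,
    headers={**HEADERS, 'Referer': 'https://hr.tencent.com/position.php'},
    parse='parse_detail',
)
```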
```python
# scrapy_plus/http/request.py
from scrapy_plus.conf.settings import HEADERS  # newly added
class Request():
    """The framework's wrapped request object"""
    def __init__(self, url, method='GET', data=None, headers=HEADERS, parse='parse', meta={}, filter=True):  # changed: headers now defaults to HEADERS
        ......
```
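One caveat with this signature: HEADERS and {} are shared, mutable default objects, so a callback that mutates request.headers or request.meta would silently affect every later request. A safer variant of the constructor, sketched with attribute assignments assumed to match the earlier chapters:

```python
from scrapy_plus.conf.settings import HEADERS

class Request(object):
    """Wrapped request object passed through the framework."""

    def __init__(self, url, method='GET', data=None, headers=None,
                 parse='parse', meta=None, filter=True):
        self.url = url
        self.method = method
        self.data = data
        # copy the defaults so mutating one request cannot leak into others
        self.headers = dict(headers) if headers else dict(HEADERS)
        self.parse = parse    # name of the spider callback for the response
        self.meta = meta or {}
        self.filter = filter  # whether this request goes through dedup filtering
```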