Title: Python Scrapy-Redis distributed crawler for Lagou

Author: 浅夏110    Posted: 2018-12-7 10:03
Title: Python Scrapy-Redis distributed crawler for Lagou
Add the following to the settings file:
# --------------------- redis configuration ---------------------
# URL fingerprint dupe filter, shared by all workers through redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# keep the queue in redis on shutdown, so the crawl can be paused and resumed
SCHEDULER_PERSIST = True

# request queue type
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"  # dequeue by priority
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # FIFO queue
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"  # LIFO (stack) scheduling

# redis pipeline; give it the largest priority number so it runs last
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 999,  # automatically writes scraped items to redis
}

# redis connection settings
# without a password:
# REDIS_HOST = '127.0.0.1'
# REDIS_PORT = 6379
# with a password:
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PARAMS = {
    'password': '',  # password
    'db': 0,  # which redis database to use
}
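Before seeding the crawl it is worth checking that these connection values actually reach redis. A minimal sketch, assuming the redis-py package (which scrapy-redis itself depends on) is installed:

import redis

# mirror the REDIS_HOST / REDIS_PORT / REDIS_PARAMS values from the settings above
r = redis.StrictRedis(host='127.0.0.1', port=6379, password='', db=0)
print(r.ping())  # True means the spiders will be able to reach this redis instance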
The spider file:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import LG_item
from ..tools.get_md5 import get_md5
import datetime
import re
from scrapy_redis.spiders import RedisCrawlSpider


class LagouSpider(RedisCrawlSpider):
    name = 'lagou'
    allowed_domains = ['lagou.com']
    # start_urls = ['https://www.lagou.com','https://www.lagou.com/gongsi/']
    # redis key that start URLs are pushed into
    redis_key = 'lagou_url'

    custom_settings = {
        "COOKIES_ENABLED": False,
        # "DOWNLOAD_DELAY": 0.2,
        # "CONCURRENT_REQUESTS": 32,
        # pipelines -- note: custom_settings takes precedence over settings.py,
        # and in most Scrapy versions this dict replaces, rather than merges
        # with, the project-wide ITEM_PIPELINES (so RedisPipeline is off here)
        'ITEM_PIPELINES': {
            'LaGou.pipelines.LGMysqlPipeline': 1
        },
        'DEFAULT_REQUEST_HEADERS': {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Connection': 'keep-alive',
                'cookie': '_ga=GA1.2.1809666113.1526002829; user_trace_token=20180511094034-4c4d62d7-54bc-11e8-949f-525400f775ce; LGUID=20180511094034-4c4d6608-54bc-11e8-949f-525400f775ce; LG_LOGIN_USER_ID=537d2089412cae011d73a44bb8911986e2cf8ecc81522b3c; JSESSIONID=ABAAABAAAGFABEF2F6A133027057686F2D420CEB60B7F87; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541598090,1542114329,1542774094,1543386087; _gid=GA1.2.118340539.1543386087; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=fb416e9ede186e36ef1a080ebf43ceba; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22%24device_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; LGSID=20181128222414-482f1608-f319-11e8-8c3d-5254005c3644; TG-TRACK-CODE=index_navigation; SEARCH_ID=8304e9b43803439494c1c91c06395eca; _gat=1; LGRID=20181128233044-924a1586-f322-11e8-8c40-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543419045',
                'Host': 'www.lagou.com',
                'Origin': 'https://www.lagou.com',
                'Referer': 'https://www.lagou.com/',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            }
        }


    rules = (
        Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        Rule(LinkExtractor(allow=(r'gongsi/j.*',)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/.+html'), callback='parse_item', follow=True),
    )
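    # How the rules above route URLs (illustrative summary inferred from the
    # regexes; the example URL below is hypothetical):
    #   zhaopin/.*  -> category listing pages, followed for more links
    #   gongsi/j.*  -> company job-listing pages, followed for more links
    #   jobs/.+html -> job detail pages, e.g. https://www.lagou.com/jobs/12345.html,
    #                  handed to parse_item and also followed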

    def parse_item(self, response):
        lagou_item = LG_item()

        try:
            # extract the fields from the job detail page
            title = response.css(".job-name::attr(title)").extract()[0]
            url = response.url
            url_object_id = get_md5(url)
            salary, job_city, work_years, degree_need, job_type = response.xpath('//dd[@class="job_request"]/p/span/text()').extract()
            tags = '|'.join(response.css('.position-label.clearfix li::text').extract())
            pulish_time = response.css('.publish_time::text').extract()[0]
            job_advantage = response.css('.job-advantage p::text').extract()[0]
            job_desc = response.css('.job_bt div').extract()[0]
            # strip HTML tags, whitespace and the literal "查看地图" (view map) link text
            job_addr = re.sub(r'<.*?>|\n| |查看地图', '', response.css('.work_addr').extract()[0])
            company_name = response.css('#job_company dt img::attr(alt)').extract()[0]
            company_url = response.xpath('//dl[@class="job_company"]/dd//li[4]/a/text()').extract_first("none")
            crawl_time = datetime.datetime.now()


            lagou_item['title'] = title
            lagou_item['url'] = url
            lagou_item['url_object_id'] = url_object_id
            lagou_item['salary'] = salary
            lagou_item['job_city'] = job_city
            lagou_item['work_years'] = work_years
            lagou_item['degree_need'] = degree_need
            lagou_item['job_type'] = job_type
            lagou_item['tags'] = tags
            lagou_item['pulish_time'] = pulish_time
            lagou_item['job_advantage'] = job_advantage
            lagou_item['job_desc'] = job_desc
            lagou_item['job_addr'] = job_addr
            lagou_item['company_name'] = company_name
            lagou_item['company_url'] = company_url
            lagou_item['crawl_time'] = crawl_time

            yield lagou_item
        except Exception as e:
            print('parse_item error: ', e)
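The post does not include the project's items.py or tools/get_md5.py. Below are minimal sketches consistent with how they are used above (assumptions, not the original author's code):

# items.py -- field names taken directly from the assignments in parse_item
import scrapy

class LG_item(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field()
    work_years = scrapy.Field()
    degree_need = scrapy.Field()
    job_type = scrapy.Field()
    tags = scrapy.Field()
    pulish_time = scrapy.Field()  # (sic) spelling follows the spider code
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field()
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    crawl_time = scrapy.Field()

# tools/get_md5.py -- a plausible implementation, inferred from the call get_md5(url)
import hashlib

def get_md5(text):
    if isinstance(text, str):
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()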
Run the spider (start one copy per worker machine or terminal; all instances share the same redis queue):
scrapy runspider <spider filename>
To set the spiders to work, connect to redis and push a start URL onto the list named by redis_key:

lpush lagou_url https://www.lagou.com
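The same seeding can also be done from Python, which is handy when scripting many start URLs. A sketch assuming redis-py and the connection values from settings:

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, password='', db=0)
r.lpush('lagou_url', 'https://www.lagou.com')  # same effect as the redis-cli command above
print(r.llen('lagou_url'))  # number of start URLs still waiting in the queue

Every idle spider instance watches this list and pops a URL as soon as one appears, while the shared RFPDupeFilter in redis keeps the workers from fetching the same page twice.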

---------------------
Author: Test_C.
Source: CSDN
Original: https://blog.csdn.net/weixin_42544006/article/details/84724948
Copyright notice: this is the blogger's original article; please include a link to the original post when reposting.