Title: Python Scrapy-Redis distributed crawler for Lagou

Author: 浅夏110    Posted: 2018-12-7 10:03
Title: Python Scrapy-Redis distributed crawler for Lagou
Add the following to the settings file:
# --------------------- redis configuration ---------------------
# URL fingerprint dupe filter, shared by all workers through redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# keep the queue in redis on shutdown, so the crawl can be paused and resumed
SCHEDULER_PERSIST = True

# request queue type
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"  # dequeue by priority
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # FIFO queue
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"  # LIFO (stack) scheduling

# redis pipeline; give it the largest priority number so it runs last
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 999,  # automatically writes scraped items to redis
}

# redis connection settings
# without a password:
# REDIS_HOST = '127.0.0.1'
# REDIS_PORT = 6379
# with a password:
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PARAMS = {
    'password': '',  # password
    'db': 0,  # which redis database to use
}
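Before seeding the crawl it is worth checking that these connection values actually reach redis. A minimal sketch, assuming the redis-py package (which scrapy-redis itself depends on) is installed:

import redis

# mirror the REDIS_HOST / REDIS_PORT / REDIS_PARAMS values from the settings above
r = redis.StrictRedis(host='127.0.0.1', port=6379, password='', db=0)
print(r.ping())  # True means the spiders will be able to reach this redis instance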
The spider file:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import LG_item
from ..tools.get_md5 import get_md5
import datetime
import re
from scrapy_redis.spiders import RedisCrawlSpider


class LagouSpider(RedisCrawlSpider):
    name = 'lagou'
    allowed_domains = ['lagou.com']
    # start_urls = ['https://www.lagou.com','https://www.lagou.com/gongsi/']
    # redis key that start URLs are pushed into
    redis_key = 'lagou_url'

    custom_settings = {
        "COOKIES_ENABLED": False,
        # "DOWNLOAD_DELAY": 0.2,
        # "CONCURRENT_REQUESTS": 32,
        # pipelines -- note: custom_settings takes precedence over settings.py,
        # and in most Scrapy versions this dict replaces, rather than merges
        # with, the project-wide ITEM_PIPELINES (so RedisPipeline is off here)
        'ITEM_PIPELINES': {
            'LaGou.pipelines.LGMysqlPipeline': 1
        },
        'DEFAULT_REQUEST_HEADERS': {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Connection': 'keep-alive',
                'cookie': '_ga=GA1.2.1809666113.1526002829; user_trace_token=20180511094034-4c4d62d7-54bc-11e8-949f-525400f775ce; LGUID=20180511094034-4c4d6608-54bc-11e8-949f-525400f775ce; LG_LOGIN_USER_ID=537d2089412cae011d73a44bb8911986e2cf8ecc81522b3c; JSESSIONID=ABAAABAAAGFABEF2F6A133027057686F2D420CEB60B7F87; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541598090,1542114329,1542774094,1543386087; _gid=GA1.2.118340539.1543386087; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=fb416e9ede186e36ef1a080ebf43ceba; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22%24device_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; LGSID=20181128222414-482f1608-f319-11e8-8c3d-5254005c3644; TG-TRACK-CODE=index_navigation; SEARCH_ID=8304e9b43803439494c1c91c06395eca; _gat=1; LGRID=20181128233044-924a1586-f322-11e8-8c40-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543419045',
                'Host': 'www.lagou.com',
                'Origin': 'https://www.lagou.com',
                'Referer': 'https://www.lagou.com/',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            }
        }


    rules = (
        Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        Rule(LinkExtractor(allow=(r'gongsi/j.*',)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/.+html'), callback='parse_item', follow=True),
    )
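    # How the rules above route URLs (illustrative summary inferred from the
    # regexes; the example URL below is hypothetical):
    #   zhaopin/.*  -> category listing pages, followed for more links
    #   gongsi/j.*  -> company job-listing pages, followed for more links
    #   jobs/.+html -> job detail pages, e.g. https://www.lagou.com/jobs/12345.html,
    #                  handed to parse_item and also followed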

    def parse_item(self, response):
        lagou_item = LG_item()

        try:
            # extract the fields from the job detail page
            title = response.css(".job-name::attr(title)").extract()[0]
            url = response.url
            url_object_id = get_md5(url)
            salary, job_city, work_years, degree_need, job_type = response.xpath('//dd[@class="job_request"]/p/span/text()').extract()
            tags = '|'.join(response.css('.position-label.clearfix li::text').extract())
            pulish_time = response.css('.publish_time::text').extract()[0]
            job_advantage = response.css('.job-advantage p::text').extract()[0]
            job_desc = response.css('.job_bt div').extract()[0]
            # strip HTML tags, whitespace and the literal "查看地图" (view map) link text
            job_addr = re.sub(r'<.*?>|\n| |查看地图', '', response.css('.work_addr').extract()[0])
            company_name = response.css('#job_company dt img::attr(alt)').extract()[0]
            company_url = response.xpath('//dl[@class="job_company"]/dd//li[4]/a/text()').extract_first("none")
            crawl_time = datetime.datetime.now()


            lagou_item['title'] = title
            lagou_item['url'] = url
            lagou_item['url_object_id'] = url_object_id
            lagou_item['salary'] = salary
            lagou_item['job_city'] = job_city
            lagou_item['work_years'] = work_years
            lagou_item['degree_need'] = degree_need
            lagou_item['job_type'] = job_type
            lagou_item['tags'] = tags
            lagou_item['pulish_time'] = pulish_time
            lagou_item['job_advantage'] = job_advantage
            lagou_item['job_desc'] = job_desc
            lagou_item['job_addr'] = job_addr
            lagou_item['company_name'] = company_name
            lagou_item['company_url'] = company_url
            lagou_item['crawl_time'] = crawl_time

            yield lagou_item
        except Exception as e:
            print('parse_item error: ', e)
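The post does not include the project's items.py or tools/get_md5.py. Below are minimal sketches consistent with how they are used above (assumptions, not the original author's code):

# items.py -- field names taken directly from the assignments in parse_item
import scrapy

class LG_item(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field()
    work_years = scrapy.Field()
    degree_need = scrapy.Field()
    job_type = scrapy.Field()
    tags = scrapy.Field()
    pulish_time = scrapy.Field()  # (sic) spelling follows the spider code
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field()
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    crawl_time = scrapy.Field()

# tools/get_md5.py -- a plausible implementation, inferred from the call get_md5(url)
import hashlib

def get_md5(text):
    if isinstance(text, str):
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()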
Run the spider (start one copy per worker machine or terminal; all instances share the same redis queue):
scrapy runspider <spider filename>
To set the spiders to work, connect to redis and push a start URL onto the list named by redis_key:

lpush lagou_url https://www.lagou.com
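The same seeding can also be done from Python, which is handy when scripting many start URLs. A sketch assuming redis-py and the connection values from settings:

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, password='', db=0)
r.lpush('lagou_url', 'https://www.lagou.com')  # same effect as the redis-cli command above
print(r.llen('lagou_url'))  # number of start URLs still waiting in the queue

Every idle spider instance watches this list and pops a URL as soon as one appears, while the shared RFPDupeFilter in redis keeps the workers from fetching the same page twice.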

---------------------
Author: Test_C.
Source: CSDN
Original: https://blog.csdn.net/weixin_42544006/article/details/84724948
Copyright notice: this is the blogger's original article; please include a link to the original post when reposting.