- 在线时间
 - 791 小时
 - 最后登录
 - 2022-11-28
 - 注册时间
 - 2017-6-12
 - 听众数
 - 15
 - 收听数
 - 0
 - 能力
 - 120 分
 - 体力
 - 36205 点
 - 威望
 - 11 点
 - 阅读权限
 - 255
 - 积分
 - 13802
 - 相册
 - 0
 - 日志
 - 0
 - 记录
 - 1
 - 帖子
 - 616
 - 主题
 - 542
 - 精华
 - 10
 - 分享
 - 0
 - 好友
 - 225
  
 
 
 
TA的每日心情  | 开心 2020-11-14 17:15 | 
|---|
 
  签到天数: 74 天 [LV.6]常住居民II 
 
 
 
群组: 2019美赛冲刺课程 群组: 站长地区赛培训 群组: 2019考研数学 桃子老师 群组: 2018教师培训(呼伦贝 群组: 2019考研数学 站长系列  | 
在 settings.py 文件中添加以下内容:
# -------------------- Redis configuration --------------------
# Fingerprint-based URL de-duplication, stored in Redis so every
# worker shares one seen-set.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Redis-backed scheduler: the request queue lives in Redis and is
# shared between all spider processes.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the queue/dupefilter in Redis on shutdown so an interrupted
# crawl can resume where it left off.
SCHEDULER_PERSIST = True

# Request-queue type — enable exactly one:
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"  # by request priority
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # FIFO queue
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"  # LIFO (stack)

# Mirror every scraped item into Redis; the high order value (999)
# makes this pipeline run after the project's own pipelines.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 999,
}

# Redis connection settings.
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PARAMS = {
    'password': '',  # leave empty when the server has no password
    'db': 0,         # index of the Redis database to use
}
爬虫文件 
# -*- coding: utf-8 -*- 
import scrapy 
from scrapy.linkextractors import LinkExtractor 
from scrapy.spiders import CrawlSpider, Rule 
from ..items import LG_item 
from ..tools.get_md5 import get_md5 
import datetime 
import re 
from scrapy_redis.spiders import RedisCrawlSpider 
  
  
class LagouSpider(RedisCrawlSpider):
    """Distributed CrawlSpider for lagou.com job postings.

    Seed URLs are pulled from the Redis list named by ``redis_key``
    (e.g. ``lpush lagou_url https://www.lagou.com``); the scrapy-redis
    scheduler then shares discovered requests between all workers.
    Job-detail pages matched by ``rules`` are parsed into ``LG_item``
    instances by :meth:`parse_item`.
    """

    name = 'lagou'
    allowed_domains = ['lagou.com']
    # Seeds come from Redis instead of start_urls:
    # start_urls = ['https://www.lagou.com','https://www.lagou.com/gongsi/']
    # Redis list key the spider blocks on for seed URLs.
    redis_key = 'lagou_url'

    custom_settings = {
        # A full session cookie is sent via DEFAULT_REQUEST_HEADERS below,
        # so Scrapy's own cookie middleware is disabled.
        "COOKIES_ENABLED": False,
        # "DOWNLOAD_DELAY": 0.2,
        # "CONCURRENT_REQUESTS" : 32,
        # Store items through the project's MySQL pipeline.
        'ITEM_PIPELINES': {
            'LaGou.pipelines.LGMysqlPipeline': 1
        },
        'DEFAULT_REQUEST_HEADERS': {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Connection': 'keep-alive',
                'cookie': '_ga=GA1.2.1809666113.1526002829; user_trace_token=20180511094034-4c4d62d7-54bc-11e8-949f-525400f775ce; LGUID=20180511094034-4c4d6608-54bc-11e8-949f-525400f775ce; LG_LOGIN_USER_ID=537d2089412cae011d73a44bb8911986e2cf8ecc81522b3c; JSESSIONID=ABAAABAAAGFABEF2F6A133027057686F2D420CEB60B7F87; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541598090,1542114329,1542774094,1543386087; _gid=GA1.2.118340539.1543386087; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=fb416e9ede186e36ef1a080ebf43ceba; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22%24device_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; LGSID=20181128222414-482f1608-f319-11e8-8c3d-5254005c3644; TG-TRACK-CODE=index_navigation; SEARCH_ID=8304e9b43803439494c1c91c06395eca; _gat=1; LGRID=20181128233044-924a1586-f322-11e8-8c40-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543419045',
                'Host': 'www.lagou.com',
                'Origin': 'https://www.lagou.com',
                'Referer': 'https://www.lagou.com/',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            }
        }

    # Follow listing/company pages; only jobs/*.html pages are parsed.
    rules = (
        Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        Rule(LinkExtractor(allow=(r'gongsi/j.*',)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/.+html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Parse one job-detail page into an ``LG_item``.

        Any extraction failure (lagou page layouts vary) is logged with a
        full traceback and the page is skipped, so one malformed page
        cannot stop the distributed crawl.
        """
        lagou_item = LG_item()

        try:
            # --- extract fields from the page ---
            title = response.css(".job-name::attr(title)").extract()[0]
            url = response.url
            url_object_id = get_md5(url)  # stable id derived from the URL
            # The five spans of .job_request appear in this fixed order.
            salary, job_city, work_years, degree_need, job_type = response.xpath('//dd[@class="job_request"]/p/span/text()').extract()
            tags = '|'.join(response.css('.position-label.clearfix li::text').extract())
            # NOTE: 'pulish_time' (sic) matches the LG_item field name —
            # renaming it would break the item/pipeline downstream.
            pulish_time = response.css('.publish_time::text').extract()[0]
            job_advantage = response.css('.job-advantage p::text').extract()[0]
            job_desc = response.css('.job_bt div').extract()[0]
            # Strip HTML tags, whitespace and the "查看地图" (view map) link text.
            job_addr = re.sub(r'<.*?>|\n| |查看地图', '', response.css('.work_addr').extract()[0])
            company_name = response.css('#job_company dt img::attr(alt)').extract()[0]
            company_url = response.xpath('//dl[@class="job_company"]/dd//li[4]/a/text()').extract_first("没有")
            crawl_time = datetime.datetime.now()

            # --- populate the item ---
            lagou_item['title'] = title
            lagou_item['url'] = url
            lagou_item['url_object_id'] = url_object_id
            lagou_item['salary'] = salary
            lagou_item['job_city'] = job_city
            lagou_item['work_years'] = work_years
            lagou_item['degree_need'] = degree_need
            lagou_item['job_type'] = job_type
            lagou_item['tags'] = tags
            lagou_item['pulish_time'] = pulish_time
            lagou_item['job_advantage'] = job_advantage
            lagou_item['job_desc'] = job_desc
            lagou_item['job_addr'] = job_addr
            lagou_item['company_name'] = company_name
            lagou_item['company_url'] = company_url
            lagou_item['crawl_time'] = crawl_time

            yield lagou_item
        except Exception:
            # Log the traceback through the spider's logger instead of
            # print(), so failures appear in the crawl log with the
            # offending URL; keep the broad catch — a single malformed
            # page must not abort the crawl.
            self.logger.exception('Failed to parse job page: %s', response.url)
运行爬虫 
scrapy runspider 爬虫文件名 
让爬虫工作起来 
连接redis ,添加一个列表 
  
lpush lagou_url https://www.lagou.com 
  
---------------------  
作者:Test_C.  
来源:CSDN  
原文:https://blog.csdn.net/weixin_42544006/article/details/84724948  
版权声明:本文为博主原创文章,转载请附上博文链接! 
 
 |   
 
zan
  
 |