- 在线时间
- 791 小时
- 最后登录
- 2022-11-28
- 注册时间
- 2017-6-12
- 听众数
- 15
- 收听数
- 0
- 能力
- 120 分
- 体力
- 35372 点
- 威望
- 11 点
- 阅读权限
- 255
- 积分
- 13555
- 相册
- 0
- 日志
- 0
- 记录
- 1
- 帖子
- 621
- 主题
- 542
- 精华
- 10
- 分享
- 0
- 好友
- 225
TA的每日心情 | 开心 2020-11-14 17:15 |
---|
签到天数: 74 天 [LV.6]常住居民II
群组: 2019美赛冲刺课程 群组: 站长地区赛培训 群组: 2019考研数学 桃子老师 群组: 2018教师培训(呼伦贝 群组: 2019考研数学 站长系列 |
发表于 2018-12-7 10:03
|显示全部楼层
|
|邮箱已经成功绑定
在 settings 文件中添加以下内容:
# --------------------- Redis settings ---------------------

# De-duplicate requests with the scrapy_redis URL-fingerprint filter.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Scheduler implementation provided by scrapy_redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Whether the crawl may be interrupted (and picked up again later).
SCHEDULER_PERSIST = True

# Request queue type. Alternatives to the active value below:
#   "scrapy_redis.queue.SpiderPriorityQueue"  -> requests are queued by priority
#   "scrapy_redis.queue.SpiderStack"          -> requests are scheduled as a stack
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # plain queue mode

# Redis item pipeline: automatically stores scraped items in Redis.
# Its order number is kept the largest relative to other pipelines.
ITEM_PIPELINES = {"scrapy_redis.pipelines.RedisPipeline": 999}

# Redis connection.
# Without a password, REDIS_HOST and REDIS_PORT alone are enough;
# with a password, additionally fill in REDIS_PARAMS.
REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379
REDIS_PARAMS = dict(
    password="",  # Redis password
    db=0,  # index of the Redis database to use
)
爬虫文件
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import LG_item
from ..tools.get_md5 import get_md5
import datetime
import re
from scrapy_redis.spiders import RedisCrawlSpider
class LagouSpider(RedisCrawlSpider):
    """Crawl spider for lagou.com job postings, fed with start URLs from Redis.

    No start URLs are hard-coded: they are pushed into the Redis list named
    by ``redis_key``. Links matching ``rules`` are followed, and job detail
    pages are handed to :meth:`parse_item`.
    """

    name = 'lagou'
    allowed_domains = ['lagou.com']
    # start_urls = ['https://www.lagou.com','https://www.lagou.com/gongsi/']
    # Redis key that holds the start URLs.
    redis_key = 'lagou_url'

    custom_settings = {
        # Presumably disabled so that the fixed 'cookie' header below is sent
        # unchanged instead of being managed by the cookie middleware.
        "COOKIES_ENABLED": False,
        # "DOWNLOAD_DELAY": 0.2,
        # "CONCURRENT_REQUESTS" : 32,
        # Item pipeline.
        # NOTE(review): this replaces any project-level ITEM_PIPELINES for
        # this spider - confirm that is intended.
        'ITEM_PIPELINES': {
            'LaGou.pipelines.LGMysqlPipeline': 1
        },
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'cookie': '_ga=GA1.2.1809666113.1526002829; user_trace_token=20180511094034-4c4d62d7-54bc-11e8-949f-525400f775ce; LGUID=20180511094034-4c4d6608-54bc-11e8-949f-525400f775ce; LG_LOGIN_USER_ID=537d2089412cae011d73a44bb8911986e2cf8ecc81522b3c; JSESSIONID=ABAAABAAAGFABEF2F6A133027057686F2D420CEB60B7F87; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541598090,1542114329,1542774094,1543386087; _gid=GA1.2.118340539.1543386087; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=fb416e9ede186e36ef1a080ebf43ceba; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22%24device_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; LGSID=20181128222414-482f1608-f319-11e8-8c3d-5254005c3644; TG-TRACK-CODE=index_navigation; SEARCH_ID=8304e9b43803439494c1c91c06395eca; _gat=1; LGRID=20181128233044-924a1586-f322-11e8-8c40-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543419045',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
    }

    rules = (
        # Listing pages are only followed; job detail pages are also parsed.
        Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        Rule(LinkExtractor(allow=(r'gongsi/j.*',)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/.+html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract one ``LG_item`` from a job detail page.

        Yields the populated item. If any field cannot be extracted (a
        selector matches nothing, or the ``job_request`` block does not
        contain exactly five spans), the error is logged with its traceback
        and the page URL, and nothing is yielded for that page.
        """
        lagou_item = LG_item()
        try:
            # Extract the fields.
            title = response.css(".job-name::attr(title)").extract()[0]
            url = response.url
            url_object_id = get_md5(url)
            # Unpacking requires exactly five <span> texts in the block.
            salary, job_city, work_years, degree_need, job_type = response.xpath(
                '//dd[@class="job_request"]/p/span/text()').extract()
            tags = '|'.join(response.css('.position-label.clearfix li::text').extract())
            pulish_time = response.css('.publish_time::text').extract()[0]
            job_advantage = response.css('.job-advantage p::text').extract()[0]
            job_desc = response.css('.job_bt div').extract()[0]
            # Strip HTML tags, newlines, spaces and the "view map" link text.
            job_addr = re.sub(r'<.*?>|\n| |查看地图', '', response.css('.work_addr').extract()[0])
            company_name = response.css('#job_company dt img::attr(alt)').extract()[0]
            company_url = response.xpath(
                '//dl[@class="job_company"]/dd//li[4]/a/text()').extract_first("没有")
            crawl_time = datetime.datetime.now()

            lagou_item['title'] = title
            lagou_item['url'] = url
            lagou_item['url_object_id'] = url_object_id
            lagou_item['salary'] = salary
            lagou_item['job_city'] = job_city
            lagou_item['work_years'] = work_years
            lagou_item['degree_need'] = degree_need
            lagou_item['job_type'] = job_type
            lagou_item['tags'] = tags
            lagou_item['pulish_time'] = pulish_time
            lagou_item['job_advantage'] = job_advantage
            lagou_item['job_desc'] = job_desc
            lagou_item['job_addr'] = job_addr
            lagou_item['company_name'] = company_name
            lagou_item['company_url'] = company_url
            lagou_item['crawl_time'] = crawl_time
        except Exception as e:
            # Best effort: skip the page, but keep the traceback and the URL
            # in the spider log instead of a bare print.
            self.logger.exception('出错了: %s (url=%s)', e, response.url)
        else:
            yield lagou_item
运行爬虫
scrapy runspider 爬虫文件名
让爬虫工作起来
连接 redis,向列表 lagou_url 中添加起始 URL:
lpush lagou_url https://www.lagou.com
---------------------
作者:Test_C.
来源:CSDN
原文:https://blog.csdn.net/weixin_42544006/article/details/84724948
版权声明:本文为博主原创文章,转载请附上博文链接!
|
zan
|