Build a Distributed Crawler in 21 Days - Jianshu Full-Site Crawl (Part 10)
Views: 7251
Published: 2019-06-29

This article is about 6528 characters long and takes roughly 21 minutes to read.

10.1. Jianshu Full-Site Crawler

Create the project

scrapy startproject jianshu
cd jianshu
scrapy genspider -t crawl jianshu_spider "jianshu.com"
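For reference, these commands leave you with Scrapy's standard project skeleton, with the crawl-template spider generated under spiders/:

jianshu/
├── scrapy.cfg
└── jianshu/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── jianshu_spider.py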

jianshu_spider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import JianshuItem


class JianshuSpiderSpider(CrawlSpider):
    name = 'jianshu_spider'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    # Jianshu article detail pages live at /p/<12-character id>
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()
        pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*", "")
        # Extract the article id: the last path segment, with any query string stripped
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        # Store the article body as HTML (tags included), not just the plain text
        content = response.xpath("//div[@class='show-content']").get()
        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath("//span[@class='comments-count']/text()").get()
        read_count = response.xpath("//span[@class='views-count']/text()").get()
        like_count = response.xpath("//span[@class='likes-count']/text()").get()
        subjects = ",".join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())
        item = JianshuItem(
            title=title,
            avatar=avatar,
            pub_time=pub_time,
            author=author,
            origin_url=response.url,
            content=content,
            article_id=article_id,
            subjects=subjects,
            word_count=word_count,
            comment_count=comment_count,
            like_count=like_count,
            read_count=read_count
        )
        yield item
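Before running the crawl, the rule's regular expression can be sanity-checked on its own. A minimal standalone test (the article ids below are made up):

import re

# Jianshu article URLs look like https://www.jianshu.com/p/<12-character id>
pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')

print(bool(pattern.match("https://www.jianshu.com/p/0a1b2c3d4e5f")))  # True: article detail page
print(bool(pattern.match("https://www.jianshu.com/u/0a1b2c3d4e5f")))  # False: user page, not followed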

items.py

import scrapy


class JianshuItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    read_count = scrapy.Field()
    like_count = scrapy.Field()
    word_count = scrapy.Field()
    subjects = scrapy.Field()
    comment_count = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Synchronous version, kept for reference:
# import pymysql
#
# class JianshuPipeline(object):
#     def __init__(self):
#         dbparams = {
#             'host': '127.0.0.1',
#             'port': 3306,
#             'user': 'root',
#             'password': '123456',
#             'database': 'jianshu',
#             'charset': 'utf8'
#         }
#         self.conn = pymysql.connect(**dbparams)
#         self.cursor = self.conn.cursor()
#         self._sql = None
#
#     def process_item(self, item, spider):
#         self.cursor.execute(self.sql, (item['title'], item['content'],
#             item['author'], item['avatar'], item['pub_time'], item['article_id'],
#             item['origin_url'], item['like_count'], item['word_count'],
#             item['subjects'], item['comment_count'], item['read_count']))
#         self.conn.commit()
#         return item
#
#     @property
#     def sql(self):
#         if not self._sql:
#             self._sql = """
#                 insert into article(id,title,content,author,avatar,pub_time,
#                 article_id,origin_url,like_count,word_count,subjects,comment_count,read_count)
#                 values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
#             """
#             return self._sql
#         return self._sql


# Save to MySQL asynchronously using Twisted's adbapi connection pool
import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors


class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(id,title,content,author,avatar,pub_time,
                article_id,origin_url,like_count,word_count,subjects,comment_count,read_count)
                values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """
            return self._sql
        return self._sql

    def process_item(self, item, spider):
        # runInteraction runs insert_item in a pooled thread,
        # so the insert does not block the crawl
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['content'],
            item['author'], item['avatar'], item['pub_time'], item['article_id'],
            item['origin_url'], item['like_count'], item['word_count'],
            item['subjects'], item['comment_count'], item['read_count']))

    def handle_error(self, error, item, spider):
        # print(error)
        pass
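The insert statement assumes an article table already exists in the jianshu database. The original post never shows the schema, so the one-off script below is only a guess at matching column types; adjust them as needed:

import pymysql

# Create the table the pipeline writes to. Column types are assumptions
# based on the fields the spider extracts, not the author's actual schema.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        create table if not exists article(
            id int primary key auto_increment,
            title varchar(255),
            content longtext,
            author varchar(255),
            avatar varchar(255),
            pub_time varchar(64),
            article_id varchar(32),
            origin_url varchar(255),
            like_count varchar(16),
            word_count varchar(16),
            subjects varchar(255),
            comment_count varchar(16),
            read_count varchar(16)
        ) default charset=utf8
    """)
conn.commit()
conn.close()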

middlewares.py

# -*- coding: utf-8 -*-
from selenium import webdriver
import time
from scrapy.http.response.html import HtmlResponse


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)
        try:
            # Keep clicking the "show more" button to load lazy content;
            # the loop actually ends when find_element_by_class_name raises
            # NoSuchElementException once the button disappears
            while True:
                showmore = self.driver.find_element_by_class_name('show-more')
                showmore.click()
                time.sleep(0.5)
                if not showmore:
                    break
        except Exception:
            pass
        source = self.driver.page_source
        # Hand the rendered page back to Scrapy so the spider parses the
        # JavaScript-loaded content instead of the raw server response
        response = HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
        return response
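One thing this middleware never does is quit Chrome when the crawl finishes. A possible refinement, not part of the original code, is to hook Scrapy's spider_closed signal; a sketch of what that adds to the class above (process_request stays exactly as shown):

from scrapy import signals
from selenium import webdriver


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this factory when building the middleware; connecting
        # the spider_closed signal lets us quit Chrome when the crawl ends
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()

    # process_request stays exactly as in the version above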

settings.py

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}
DOWNLOADER_MIDDLEWARES = {
    'jianshu.middlewares.SeleniumDownloadMiddleware': 543,
}
ITEM_PIPELINES = {
    # 'jianshu.pipelines.JianshuPipeline': 300,
    'jianshu.pipelines.JianshuTwistedPipeline': 1,
}

start.py

from scrapy import cmdline

cmdline.execute("scrapy crawl jianshu_spider".split())
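Running python start.py from the project root is equivalent to typing scrapy crawl jianshu_spider on the command line; it is just a convenience so the crawl can be launched and debugged from an IDE.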

